improved word extraction a bit (unicode punctuation)
This commit is contained in:
parent
40a5905b15
commit
d42db8b65d
@ -1,13 +1,15 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.9 2005-02-10 19:52:50 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.10 2005-02-11 11:20:02 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
#ifndef TEST_TEXTSPLIT
|
||||
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
#include <set>
|
||||
#include "textsplit.h"
|
||||
#include "debuglog.h"
|
||||
#include "utf8iter.h"
|
||||
#include "uproplist.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -37,6 +39,8 @@ using namespace std;
|
||||
// once.
|
||||
enum CharClass {LETTER=256, SPACE=257, DIGIT=258};
|
||||
static int charclasses[256];
|
||||
|
||||
static set<unsigned int> unicign;
|
||||
static void setcharclasses()
|
||||
{
|
||||
static int init = 0;
|
||||
@ -67,6 +71,8 @@ static void setcharclasses()
|
||||
|
||||
init = 1;
|
||||
//for (i=0;i<256;i++)cerr<<i<<" -> "<<charclasses[i]<<endl;
|
||||
for (i = 0; i < sizeof(uniign); i++)
|
||||
unicign.insert(uniign[i]);
|
||||
}
|
||||
|
||||
// Do some cleanup (the kind which is simpler to do here than in the main loop,
|
||||
@ -152,6 +158,22 @@ bool TextSplit::doemit(string &word, int &wordpos, string &span, int spanpos,
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline int whatcc(unsigned int c)
|
||||
{
|
||||
int cc;
|
||||
if (c <= 127) {
|
||||
cc = charclasses[c];
|
||||
} else {
|
||||
if (c == (unsigned int)-1)
|
||||
cc = SPACE;
|
||||
else if (unicign.find(c) != unicign.end())
|
||||
cc = SPACE;
|
||||
else
|
||||
cc = LETTER;
|
||||
}
|
||||
return cc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Splitting a text into terms to be indexed.
|
||||
* We basically emit a word every time we see a separator, but some chars are
|
||||
@ -167,16 +189,21 @@ bool TextSplit::text_to_words(const string &in)
|
||||
bool number = false;
|
||||
int wordpos = 0;
|
||||
int spanpos = 0;
|
||||
unsigned int i;
|
||||
int charpos = 0;
|
||||
Utf8Iter it(in);
|
||||
|
||||
for (i = 0; i < in.length(); i++) {
|
||||
int c = in[i];
|
||||
int cc = charclasses[c];
|
||||
for (; !it.eof(); it++, charpos++) {
|
||||
unsigned int c = *it;
|
||||
if (c == (unsigned int)-1) {
|
||||
LOGERR(("Textsplit: error occured while scanning UTF-8 string\n"));
|
||||
return false;
|
||||
}
|
||||
int cc = whatcc(c);
|
||||
switch (cc) {
|
||||
case SPACE:
|
||||
SPACE:
|
||||
if (word.length()) {
|
||||
if (!doemit(word, wordpos, span, spanpos, true, i))
|
||||
if (!doemit(word, wordpos, span, spanpos, true, it.getBpos()))
|
||||
return false;
|
||||
number = false;
|
||||
}
|
||||
@ -186,56 +213,57 @@ bool TextSplit::text_to_words(const string &in)
|
||||
case '-':
|
||||
case '+':
|
||||
if (word.length() == 0) {
|
||||
if (i < in.length() && charclasses[int(in[i+1])] == DIGIT) {
|
||||
if (whatcc(it[charpos+1]) == DIGIT) {
|
||||
number = true;
|
||||
word += c;
|
||||
span += c;
|
||||
word += it;
|
||||
span += it;
|
||||
}
|
||||
} else {
|
||||
if (!doemit(word, wordpos, span, spanpos, false, i))
|
||||
if (!doemit(word, wordpos, span, spanpos, false, it.getBpos()))
|
||||
return false;
|
||||
number = false;
|
||||
span += c;
|
||||
span += it;
|
||||
}
|
||||
break;
|
||||
case '@':
|
||||
if (word.length()) {
|
||||
if (!doemit(word, wordpos, span, spanpos, false, i))
|
||||
if (!doemit(word, wordpos, span, spanpos, false, it.getBpos()))
|
||||
return false;
|
||||
number = false;
|
||||
} else
|
||||
word += c;
|
||||
span += c;
|
||||
word += it;
|
||||
span += it;
|
||||
break;
|
||||
case '\'':
|
||||
if (word.length()) {
|
||||
if (!doemit(word, wordpos, span, spanpos, false, i))
|
||||
if (!doemit(word, wordpos, span, spanpos, false, it.getBpos()))
|
||||
return false;
|
||||
number = false;
|
||||
span += c;
|
||||
span += it;
|
||||
}
|
||||
break;
|
||||
case '.':
|
||||
if (number) {
|
||||
word += c;
|
||||
word += it;
|
||||
} else {
|
||||
//cerr<<"Got . span: '"<<span<<"' word: '"<<word<<"'"<<endl;
|
||||
if (word.length()) {
|
||||
if (!doemit(word, wordpos, span, spanpos, false, i))
|
||||
if (!doemit(word, wordpos, span, spanpos, false, it.getBpos()))
|
||||
return false;
|
||||
number = false;
|
||||
} else
|
||||
word += c;
|
||||
word += it;
|
||||
}
|
||||
span += c;
|
||||
span += it;
|
||||
break;
|
||||
case '#':
|
||||
// Keep it only at end of word...
|
||||
if (word.length() > 0 &&
|
||||
(i == in.length() -1 || charclasses[int(in[i+1])] == SPACE ||
|
||||
in[i+1] == '\n' || in[i+1] == '\r')) {
|
||||
word += c;
|
||||
span += c;
|
||||
(whatcc(it[charpos+1]) == SPACE ||
|
||||
whatcc(it[charpos+1]) == '\n' ||
|
||||
whatcc(it[charpos+1]) == '\r')) {
|
||||
word += it;
|
||||
span += it;
|
||||
}
|
||||
|
||||
break;
|
||||
@ -261,13 +289,13 @@ bool TextSplit::text_to_words(const string &in)
|
||||
else
|
||||
number = false;
|
||||
}
|
||||
word += (char)c;
|
||||
span += (char)c;
|
||||
word += it;
|
||||
span += it;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (word.length()) {
|
||||
if (!doemit(word, wordpos, span, spanpos, true, i))
|
||||
if (span.length()) {
|
||||
if (!doemit(word, wordpos, span, spanpos, true, it.getBpos()))
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
@ -306,7 +334,8 @@ static string teststring =
|
||||
"192.168.4.1 "
|
||||
"one\n\rtwo\nthree-\nfour "
|
||||
"[olala][ululu] "
|
||||
"'o'brien' "
|
||||
"'o'brien' "
|
||||
"utf-8 ucs-4©"
|
||||
"\n"
|
||||
;
|
||||
|
||||
|
||||
168
src/common/uproplist.h
Normal file
168
src/common/uproplist.h
Normal file
@ -0,0 +1,168 @@
|
||||
#ifndef _PROPLIST_H_INCLUDED_
|
||||
#define _PROPLIST_H_INCLUDED_
|
||||
/* @(#$Id: uproplist.h,v 1.1 2005-02-11 11:20:02 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/*
|
||||
* A subset of Unicode chars that we consider whitespace when we split text in
|
||||
* words.
|
||||
|
||||
* This is used as a quick fix to the ascii-based code, and is not correct.
|
||||
* The correct way would be to do what http://www.unicode.org/reports/tr29/
|
||||
* says. We should then convert first to ucs-4, and then strictly use
|
||||
* character properties, which might actually be simpler than the current
|
||||
* solution...
|
||||
*
|
||||
* From:
|
||||
# PropList-4.0.1.txt
|
||||
# Date: 2004-03-02, 02:42:40 GMT [MD]
|
||||
#
|
||||
# Unicode Character Database
|
||||
# Copyright (c) 1991-2004 Unicode, Inc.
|
||||
# For terms of use, see http://www.unicode.org/terms_of_use.html
|
||||
# For documentation, see UCD.html
|
||||
*/
|
||||
|
||||
/*
 * Code points that are considered white space when splitting text into
 * words: white space proper, a few miscellaneous signs, dashes, hyphens,
 * quotation marks and terminal punctuation. Entries are grouped by the
 * PropList property they were taken from, so a code point can appear under
 * more than one property; those repetitions mirror the source list.
 */
static const unsigned int uniign[] = {
    0x0085, /* ; White_Space # Cc <control-0085>*/
    0x00A0, /* ; White_Space # Zs NO-BREAK SPACE*/
    0x00A1, /* misc signs, bullet etc... */
    0x00A2,
    0x00A3,
    0x00A4,
    0x00A5,
    0x00A6,
    0x00A9, /* copyright sign */
    0x00AA,
    0x00AE, /* registered sign */
    0x1680, /* ; White_Space # Zs OGHAM SPACE MARK*/
    0x180E, /* ; White_Space # Zs MONGOLIAN VOWEL SEPARATOR*/
    0x2000, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
    0x2001, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
    0x2002, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
    0x2003, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
    0x2004, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
    0x2005, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
    0x2006, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
    0x2007, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
    0x2008, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
    0x2009, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
    0x200A, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
    0x2028, /* ; White_Space # Zl LINE SEPARATOR*/
    0x2029, /* ; White_Space # Zp PARAGRAPH SEPARATOR*/
    0x202F, /* ; White_Space # Zs NARROW NO-BREAK SPACE*/
    0x205F, /* ; White_Space # Zs MEDIUM MATHEMATICAL SPACE*/
    0x3000, /* ; White_Space # Zs IDEOGRAPHIC SPACE*/
    0x002D, /* ; Dash # Pd HYPHEN-MINUS*/
    0x058A, /* ; Dash # Pd ARMENIAN HYPHEN*/
    0x1806, /* ; Dash # Pd MONGOLIAN TODO SOFT HYPHEN*/
    0x2010, /* ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR*/
    0x2011, /* ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR*/
    0x2012, /* ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR*/
    0x2013, /* ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR*/
    0x2014, /* ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR*/
    0x2015, /* ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR*/
    0x2053, /* ; Dash # Po SWUNG DASH*/
    0x207B, /* ; Dash # Sm SUPERSCRIPT MINUS*/
    0x208B, /* ; Dash # Sm SUBSCRIPT MINUS*/
    0x2212, /* ; Dash # Sm MINUS SIGN*/
    0x301C, /* ; Dash # Pd WAVE DASH*/
    0x3030, /* ; Dash # Pd WAVY DASH*/
    0xFE31, /* ; Dash # Pd PRESENTATION FORM FOR VERTICAL EM DASH*/
    0xFE32, /* ; Dash # Pd PRESENTATION FORM FOR VERTICAL EN DASH*/
    0xFE58, /* ; Dash # Pd SMALL EM DASH*/
    0xFE63, /* ; Dash # Pd SMALL HYPHEN-MINUS*/
    0xFF0D, /* ; Dash # Pd FULLWIDTH HYPHEN-MINUS*/
    0x00AD, /* ; Hyphen # Cf SOFT HYPHEN*/
    0x058A, /* ; Hyphen # Pd ARMENIAN HYPHEN*/
    0x1806, /* ; Hyphen # Pd MONGOLIAN TODO SOFT HYPHEN*/
    0x2010, /* ; Hyphen # Pd [2] HYPHEN..NON-BREAKING HYPHEN*/
    0x2011, /* ; Hyphen # Pd [2] HYPHEN..NON-BREAKING HYPHEN*/
    0x30FB, /* ; Hyphen # Pc KATAKANA MIDDLE DOT*/
    0xFE63, /* ; Hyphen # Pd SMALL HYPHEN-MINUS*/
    0xFF0D, /* ; Hyphen # Pd FULLWIDTH HYPHEN-MINUS*/
    0xFF65, /* ; Hyphen # Pc HALFWIDTH KATAKANA MIDDLE DOT*/
    0x00AB, /* ; Quotation_Mark # Pi LEFT-POINTING DOUBLE ANGLE QUOTATION MARK*/
    0x00BB, /* ; Quotation_Mark # Pf RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK*/
    0x2018, /* ; Quotation_Mark # Pi LEFT SINGLE QUOTATION MARK*/
    0x2019, /* ; Quotation_Mark # Pf RIGHT SINGLE QUOTATION MARK*/
    0x201A, /* ; Quotation_Mark # Ps SINGLE LOW-9 QUOTATION MARK*/
    0x201B, /* ; Quotation_Mark # Pi SINGLE HIGH-REVERSED-9 QUOTATION MARK*/
    0x201C, /* ; Quotation_Mark # Pi LEFT DOUBLE QUOTATION MARK*/
    0x201D, /* ; Quotation_Mark # Pf RIGHT DOUBLE QUOTATION MARK*/
    0x201E, /* ; Quotation_Mark # Ps DOUBLE LOW-9 QUOTATION MARK*/
    0x201F, /* ; Quotation_Mark # Pi DOUBLE HIGH-REVERSED-9 QUOTATION MARK*/
    0x2039, /* ; Quotation_Mark # Pi SINGLE LEFT-POINTING ANGLE QUOTATION MARK*/
    0x203A, /* ; Quotation_Mark # Pf SINGLE RIGHT-POINTING ANGLE QUOTATION MARK*/
    0x300C, /* ; Quotation_Mark # Ps LEFT CORNER BRACKET*/
    0x300D, /* ; Quotation_Mark # Pe RIGHT CORNER BRACKET*/
    0x300E, /* ; Quotation_Mark # Ps LEFT WHITE CORNER BRACKET*/
    0x300F, /* ; Quotation_Mark # Pe RIGHT WHITE CORNER BRACKET*/
    0x301D, /* ; Quotation_Mark # Ps REVERSED DOUBLE PRIME QUOTATION MARK*/
    0x301E, /* ; Quotation_Mark # Pe DOUBLE PRIME QUOTATION MARK*/
    /* LOW DOUBLE PRIME QUOTATION MARK is U+301F; it was previously entered
     * as a second 0x301E, which left U+301F out of the list. */
    0x301F, /* ; Quotation_Mark # Pe LOW DOUBLE PRIME QUOTATION MARK*/
    0xFE41, /* ; Quotation_Mark # Ps PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET*/
    0xFE42, /* ; Quotation_Mark # Pe PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET*/
    0xFE43, /* ; Quotation_Mark # Ps PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET*/
    0xFE44, /* ; Quotation_Mark # Pe PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET*/
    0xFF02, /* ; Quotation_Mark # Po FULLWIDTH QUOTATION MARK*/
    0xFF07, /* ; Quotation_Mark # Po FULLWIDTH APOSTROPHE*/
    0xFF62, /* ; Quotation_Mark # Ps HALFWIDTH LEFT CORNER BRACKET*/
    0xFF63, /* ; Quotation_Mark # Pe HALFWIDTH RIGHT CORNER BRACKET*/
    0x0021, /* ; Terminal_Punctuation # Po EXCLAMATION MARK*/
    0x002C, /* ; Terminal_Punctuation # Po COMMA*/
    0x002E, /* ; Terminal_Punctuation # Po FULL STOP*/
    0x003A, /* ; Terminal_Punctuation # Po [2] COLON..SEMICOLON*/
    0x003B, /* ; Terminal_Punctuation # Po [2] COLON..SEMICOLON*/
    0x003F, /* ; Terminal_Punctuation # Po QUESTION MARK*/
    0x037E, /* ; Terminal_Punctuation # Po GREEK QUESTION MARK*/
    0x0387, /* ; Terminal_Punctuation # Po GREEK ANO TELEIA*/
    0x0589, /* ; Terminal_Punctuation # Po ARMENIAN FULL STOP*/
    0x05C3, /* ; Terminal_Punctuation # Po HEBREW PUNCTUATION SOF PASUQ*/
    0x060C, /* ; Terminal_Punctuation # Po ARABIC COMMA*/
    0x061B, /* ; Terminal_Punctuation # Po ARABIC SEMICOLON*/
    0x061F, /* ; Terminal_Punctuation # Po ARABIC QUESTION MARK*/
    0x06D4, /* ; Terminal_Punctuation # Po ARABIC FULL STOP*/
    0x2047, /* ; Terminal_Punctuation # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
    0x2048, /* ; Terminal_Punctuation # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
    0x2049, /* ; Terminal_Punctuation # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
    0xFE50, /* ; Terminal_Punctuation # Po [3] SMALL COMMA..SMALL FULL STOP*/
    0xFE51, /* ; Terminal_Punctuation # Po [3] SMALL COMMA..SMALL FULL STOP*/
    0xFE52, /* ; Terminal_Punctuation # Po [3] SMALL COMMA..SMALL FULL STOP*/
    0xFE54, /* ; Terminal_Punctuation # Po [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK*/
    0xFE55, /* ; Terminal_Punctuation # Po [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK*/
    0xFE56, /* ; Terminal_Punctuation # Po [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK*/
    0xFE57, /* ; Terminal_Punctuation # Po [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK*/
    0xFF01, /* ; Terminal_Punctuation # Po FULLWIDTH EXCLAMATION MARK*/
    0xFF0C, /* ; Terminal_Punctuation # Po FULLWIDTH COMMA*/
    0xFF0E, /* ; Terminal_Punctuation # Po FULLWIDTH FULL STOP*/
    0xFF1A, /* ; Terminal_Punctuation # Po [2] FULLWIDTH COLON..FULLWIDTH SEMICOLON*/
    0xFF1B, /* ; Terminal_Punctuation # Po [2] FULLWIDTH COLON..FULLWIDTH SEMICOLON*/
    0xFF1F, /* ; Terminal_Punctuation # Po FULLWIDTH QUESTION MARK*/
    0xFF61, /* ; Terminal_Punctuation # Po HALFWIDTH IDEOGRAPHIC FULL STOP*/
    0xFF64, /* ; Terminal_Punctuation # Po HALFWIDTH IDEOGRAPHIC COMMA*/
    0x0021, /* ; STerm # Po EXCLAMATION MARK*/
    0x002E, /* ; STerm # Po FULL STOP*/
    0x003F, /* ; STerm # Po QUESTION MARK*/
    0x055C, /* ; STerm # Po ARMENIAN EXCLAMATION MARK*/
    0x055E, /* ; STerm # Po ARMENIAN QUESTION MARK*/
    0x0589, /* ; STerm # Po ARMENIAN FULL STOP*/
    0x061F, /* ; STerm # Po ARABIC QUESTION MARK*/
    0x06D4, /* ; STerm # Po ARABIC FULL STOP*/
    0x166E, /* ; STerm # Po CANADIAN SYLLABICS FULL STOP*/
    0x1803, /* ; STerm # Po MONGOLIAN FULL STOP*/
    0x1809, /* ; STerm # Po MONGOLIAN MANCHU FULL STOP*/
    0x203C, /* ; STerm # Po [2] DOUBLE EXCLAMATION MARK..INTERROBANG*/
    0x203D, /* ; STerm # Po [2] DOUBLE EXCLAMATION MARK..INTERROBANG*/
    0x2047, /* ; STerm # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
    0x2048, /* ; STerm # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
    0x2049, /* ; STerm # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
    0x3002, /* ; STerm # Po IDEOGRAPHIC FULL STOP*/
    0xFE52, /* ; STerm # Po SMALL FULL STOP*/
    0xFE56, /* ; STerm # Po SMALL QUESTION MARK*/
    0xFE57, /* ; STerm # Po SMALL EXCLAMATION MARK*/
    0xFF01, /* ; STerm # Po FULLWIDTH EXCLAMATION MARK*/
    0xFF0E, /* ; STerm # Po FULLWIDTH FULL STOP*/
    0xFF1F, /* ; STerm # Po FULLWIDTH QUESTION MARK*/
    0xFF61, /* ; STerm # Po HALFWIDTH IDEOGRAPHIC FULL STOP*/
};
|
||||
|
||||
#endif /* _PROPLIST_H_INCLUDED_ */
|
||||
@ -35,7 +35,7 @@ trtranscode.o : ../utils/transcode.cpp
|
||||
|
||||
MIMEPARSE_OBJS= trmimeparse.o $(BIGLIB)
|
||||
trmimeparse : $(MIMEPARSE_OBJS)
|
||||
$(CXX) $(CXXFLAGS) -o mimeparse $(MIMEPARSE_OBJS) $(LIBICONV)
|
||||
$(CXX) $(CXXFLAGS) -o trmimeparse $(MIMEPARSE_OBJS) $(LIBICONV)
|
||||
trmimeparse.o : mimeparse.cpp
|
||||
$(CXX) $(CXXFLAGS) -DTEST_MIMEPARSE -c -o trmimeparse.o \
|
||||
mimeparse.cpp
|
||||
|
||||
@ -1,10 +1,11 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: utf8iter.cpp,v 1.1 2005-02-10 19:52:50 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: utf8iter.cpp,v 1.2 2005-02-11 11:20:02 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||
#endif
|
||||
#include <stdio.h>
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
#include <list>
|
||||
#include <vector>
|
||||
#include "debuglog.h"
|
||||
using namespace std;
|
||||
|
||||
@ -22,32 +23,61 @@ int main(int argc, char **argv)
|
||||
const char *infile = argv[1];
|
||||
const char *outfile = argv[2];
|
||||
string in;
|
||||
string out;
|
||||
if (!file_to_string(infile, in)) {
|
||||
cerr << "Cant read file\n" << endl;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
vector<unsigned int>ucsout1;
|
||||
string out, out1;
|
||||
Utf8Iter it(in);
|
||||
FILE *fp = fopen(outfile, "w");
|
||||
if (fp == 0) {
|
||||
fprintf(stderr, "cant create %s\n", outfile);
|
||||
exit(1);
|
||||
}
|
||||
while (!it.eof()) {
|
||||
int nchars = 0;
|
||||
for (;!it.eof(); it++) {
|
||||
unsigned int value = *it;
|
||||
it.appendchartostring(out);
|
||||
it++;
|
||||
if (value == (unsigned int)-1) {
|
||||
fprintf(stderr, "Conversion error occurred\n");
|
||||
exit(1);
|
||||
}
|
||||
ucsout1.push_back(value);
|
||||
fwrite(&value, 4, 1, fp);
|
||||
if (!it.appendchartostring(out))
|
||||
break;
|
||||
out1 += it;
|
||||
nchars++;
|
||||
}
|
||||
fclose(fp);
|
||||
if (it.error()) {
|
||||
fprintf(stderr, "Conversion error occurred\n");
|
||||
exit(1);
|
||||
}
|
||||
fprintf(stderr, "nchars1 %d\n", nchars);
|
||||
if (in != out) {
|
||||
fprintf(stderr, "error: out != in\n");
|
||||
exit(1);
|
||||
}
|
||||
if (in != out1) {
|
||||
fprintf(stderr, "error: out1 != in\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
vector<unsigned int>ucsout2;
|
||||
it.rewind();
|
||||
for (int i = 0; ; i++) {
|
||||
unsigned int value;
|
||||
if ((value = it[i]) == (unsigned int)-1) {
|
||||
fprintf(stderr, "%d chars\n", i);
|
||||
break;
|
||||
}
|
||||
it++;
|
||||
ucsout2.push_back(value);
|
||||
}
|
||||
|
||||
if (ucsout1 != ucsout2) {
|
||||
fprintf(stderr, "error: ucsout1 != ucsout2\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
fclose(fp);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
#ifndef _UTF8ITER_H_INCLUDED_
|
||||
#define _UTF8ITER_H_INCLUDED_
|
||||
/* @(#$Id: utf8iter.h,v 1.1 2005-02-10 19:52:50 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: utf8iter.h,v 1.2 2005-02-11 11:20:02 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
/**
|
||||
* A small helper class to iterate over utf8 strings. This is not an
|
||||
@ -8,58 +8,113 @@
|
||||
some specific uses
|
||||
*/
|
||||
class Utf8Iter {
|
||||
unsigned int cl;
|
||||
const string &s;
|
||||
string::size_type pos;
|
||||
bool bad;
|
||||
int compute_cl() {
|
||||
unsigned int cl; // Char length at current position if known
|
||||
const string &s; // String we're working with
|
||||
string::size_type pos; // Current position in string
|
||||
bool bad; // Status
|
||||
unsigned int m_charpos; // Current character position
|
||||
|
||||
// Get character byte length at specified position
|
||||
inline int get_cl(string::size_type p) const {
|
||||
unsigned int z = (unsigned char)s[p];
|
||||
if (z <= 127) {
|
||||
return 1;
|
||||
} else if (z>=192 && z <= 223) {
|
||||
return 2;
|
||||
} else if (z >= 224 && z <= 239) {
|
||||
return 3;
|
||||
} else if (z >= 240 && z <= 247) {
|
||||
return 4;
|
||||
} else if (z >= 248 && z <= 251) {
|
||||
return 5;
|
||||
} else if (z >= 252 && z <= 253) {
|
||||
return 6;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
// Check position and cl against string length
|
||||
bool poslok(string::size_type p, int l) const {
|
||||
return p != string::npos && l > 0 && p + l <= s.length();
|
||||
}
|
||||
// Update current char length in object state. Assumes pos is inside string
|
||||
inline int compute_cl() {
|
||||
cl = 0;
|
||||
if (bad)
|
||||
return -1;
|
||||
unsigned int z = (unsigned char)s[pos];
|
||||
if (z <= 127) {
|
||||
cl = 1;
|
||||
} else if (z>=192 && z <= 223) {
|
||||
cl = 2;
|
||||
} else if (z >= 224 && z <= 239) {
|
||||
cl = 3;
|
||||
} else if (z >= 240 && z <= 247) {
|
||||
cl = 4;
|
||||
} else if (z >= 248 && z <= 251) {
|
||||
cl = 5;
|
||||
} else if (z >= 252 && z <= 253) {
|
||||
cl = 6;
|
||||
}
|
||||
if (!cl || s.length() - pos < cl) {
|
||||
cl = get_cl(pos);
|
||||
if (!poslok(pos, cl)) {
|
||||
bad = true;
|
||||
cl = 0;
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
// Compute value at given position
|
||||
inline unsigned int getvalueat(string::size_type p, int l) const {
|
||||
switch (l) {
|
||||
case 1: return (unsigned char)s[p];
|
||||
case 2: return ((unsigned char)s[p] - 192) * 64 +
|
||||
(unsigned char)s[p+1] - 128 ;
|
||||
case 3: return ((unsigned char)s[p]-224)*4096 +
|
||||
((unsigned char)s[p+1]-128)*64 +
|
||||
(unsigned char)s[p+2]-128;
|
||||
case 4: return ((unsigned char)s[p]-240)*262144 +
|
||||
((unsigned char)s[p+1]-128)*4096 +
|
||||
((unsigned char)s[p+2]-128)*64 +
|
||||
(unsigned char)s[p+3]-128;
|
||||
case 5: return ((unsigned char)s[p]-248)*16777216 +
|
||||
((unsigned char)s[p+1]-128)*262144 +
|
||||
((unsigned char)s[p+2]-128)*4096 +
|
||||
((unsigned char)s[p+3]-128)*64 +
|
||||
(unsigned char)s[p+4]-128;
|
||||
case 6: return ((unsigned char)s[p]-252)*1073741824 +
|
||||
((unsigned char)s[p+1]-128)*16777216 +
|
||||
((unsigned char)s[p+2]-128)*262144 +
|
||||
((unsigned char)s[p+3]-128)*4096 +
|
||||
((unsigned char)s[p+4]-128)*64 +
|
||||
(unsigned char)s[p+5]-128;
|
||||
default:
|
||||
return (unsigned int)-1;
|
||||
}
|
||||
}
|
||||
public:
|
||||
Utf8Iter(const string &in) : cl(0), s(in), pos(0), bad(false) {}
|
||||
Utf8Iter(const string &in)
|
||||
: cl(0), s(in), pos(0), bad(false), m_charpos(0) {}
|
||||
|
||||
// Put the iterator back at the start of the string: byte and character
// positions return to 0, the cached character length is dropped and the
// error flag is cleared.
void rewind() {
    pos = 0;
    m_charpos = 0;
    cl = 0;
    bad = false;
}
|
||||
/** operator* returns the ucs4 value as a machine integer*/
|
||||
unsigned int operator*() {
|
||||
if (!cl && compute_cl() < 0)
|
||||
return (unsigned int)-1;
|
||||
switch (cl) {
|
||||
case 1: return (unsigned char)s[pos];
|
||||
case 2: return ((unsigned char)s[pos] - 192) * 64 + (unsigned char)s[pos+1] - 128 ;
|
||||
case 3: return ((unsigned char)s[pos]-224)*4096 + ((unsigned char)s[pos+1]-128)*64 + (unsigned char)s[pos+2]-128;
|
||||
case 4: return ((unsigned char)s[pos]-240)*262144 + ((unsigned char)s[pos+1]-128)*4096 +
|
||||
((unsigned char)s[pos+2]-128)*64 + (unsigned char)s[pos+3]-128;
|
||||
case 5: return ((unsigned char)s[pos]-248)*16777216 + ((unsigned char)s[pos+1]-128)*262144 +
|
||||
((unsigned char)s[pos+2]-128)*4096 + ((unsigned char)s[pos+3]-128)*64 + (unsigned char)s[pos+4]-128;
|
||||
case 6: return ((unsigned char)s[pos]-252)*1073741824 + ((unsigned char)s[pos+1]-128)*16777216 +
|
||||
((unsigned char)s[pos+2]-128)*262144 + ((unsigned char)s[pos+3]-128)*4096 +
|
||||
((unsigned char)s[pos+4]-128)*64 + (unsigned char)s[pos+5]-128;
|
||||
default:
|
||||
unsigned int val = getvalueat(pos, cl);
|
||||
if (val == (unsigned int)-1) {
|
||||
bad = true;
|
||||
cl = 0;
|
||||
return (unsigned int)-1;
|
||||
}
|
||||
return val;
|
||||
}
|
||||
/** "Direct" access. Awfully inefficient as we skip from start or current
|
||||
* position at best. This can only be useful for a lookahead from the
|
||||
* current position */
|
||||
unsigned int operator[](unsigned int charpos) const {
|
||||
string::size_type mypos = 0;
|
||||
unsigned int mycp = 0;;
|
||||
if (charpos >= m_charpos) {
|
||||
mypos = pos;
|
||||
mycp = m_charpos;
|
||||
}
|
||||
while (mypos < s.length() && mycp != charpos) {
|
||||
mypos += get_cl(mypos);
|
||||
++mycp;
|
||||
}
|
||||
if (mypos < s.length() && mycp == charpos) {
|
||||
int l = get_cl(mypos);
|
||||
if (poslok(mypos, l))
|
||||
return getvalueat(mypos, get_cl(mypos));
|
||||
}
|
||||
return (unsigned int)-1;
|
||||
}
|
||||
|
||||
string::size_type operator++(int) {
|
||||
@ -67,6 +122,7 @@ class Utf8Iter {
|
||||
return string::npos;
|
||||
}
|
||||
pos += cl;
|
||||
m_charpos++;
|
||||
cl = 0;
|
||||
return pos;
|
||||
}
|
||||
@ -78,12 +134,24 @@ class Utf8Iter {
|
||||
out += s.substr(pos, cl);
|
||||
return true;
|
||||
}
|
||||
// Conversion to string: yields the bytes of the character at the current
// position, or an empty string when the iterator is in error or the
// character length cannot be computed.
operator string() {
    if (bad || (!cl && compute_cl() < 0)) {
        // Return an explicit empty string. "return false" would build the
        // string from a null pointer where it compiles at all.
        return string();
    }
    return s.substr(pos, cl);
}
|
||||
// True when iteration must stop: either the error flag is set or the byte
// position has reached the end of the string.
bool eof() {
|
||||
return bad || pos == s.length();
|
||||
}
|
||||
// True when the error flag is set.
bool error() {
|
||||
return bad;
|
||||
}
|
||||
// Current position in the string, counted in bytes.
string::size_type getBpos() const {
|
||||
return pos;
|
||||
}
|
||||
// Current position in the string, counted in characters.
string::size_type getCpos() const {
|
||||
return m_charpos;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
212
src/utils/utf8testin.txt
Normal file
212
src/utils/utf8testin.txt
Normal file
@ -0,0 +1,212 @@
|
||||
|
||||
UTF-8 encoded sample plain-text file
|
||||
‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾
|
||||
|
||||
Markus Kuhn [ˈmaʳkʊs kuːn] <http://www.cl.cam.ac.uk/~mgk25/> — 2002-07-25
|
||||
|
||||
|
||||
The ASCII compatible UTF-8 encoding used in this plain-text file
|
||||
is defined in Unicode, ISO 10646-1, and RFC 2279.
|
||||
|
||||
|
||||
Using Unicode/UTF-8, you can write in emails and source code things such as
|
||||
|
||||
Mathematics and sciences:
|
||||
|
||||
∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i), ⎧⎡⎛┌─────┐⎞⎤⎫
|
||||
⎪⎢⎜│a²+b³ ⎟⎥⎪
|
||||
∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ ¬β = ¬(¬α ∨ β), ⎪⎢⎜│───── ⎟⎥⎪
|
||||
⎪⎢⎜⎷ c₈ ⎟⎥⎪
|
||||
ℕ ⊆ ℕ₀ ⊂ ℤ ⊂ ℚ ⊂ ℝ ⊂ ℂ, ⎨⎢⎜ ⎟⎥⎬
|
||||
⎪⎢⎜ ∞ ⎟⎥⎪
|
||||
⊥ < a ≠ b ≡ c ≤ d ≪ ⊤ ⇒ (⟦A⟧ ⇔ ⟪B⟫), ⎪⎢⎜ ⎲ ⎟⎥⎪
|
||||
⎪⎢⎜ ⎳aⁱ-bⁱ⎟⎥⎪
|
||||
2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm ⎩⎣⎝i=1 ⎠⎦⎭
|
||||
|
||||
Linguistics and dictionaries:
|
||||
|
||||
ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn
|
||||
Y [ˈʏpsilɔn], Yen [jɛn], Yoga [ˈjoːgɑ]
|
||||
|
||||
APL:
|
||||
|
||||
((V⍳V)=⍳⍴V)/V←,V ⌷←⍳→⍴∆∇⊃‾⍎⍕⌈
|
||||
|
||||
Nicer typography in plain text files:
|
||||
|
||||
╔══════════════════════════════════════════╗
|
||||
║ ║
|
||||
║ • ‘single’ and “double” quotes ║
|
||||
║ ║
|
||||
║ • Curly apostrophes: “We’ve been here” ║
|
||||
║ ║
|
||||
║ • Latin-1 apostrophe and accents: '´` ║
|
||||
║ ║
|
||||
║ • ‚deutsche‘ „Anführungszeichen“ ║
|
||||
║ ║
|
||||
║ • †, ‡, ‰, •, 3–4, —, −5/+5, ™, … ║
|
||||
║ ║
|
||||
║ • ASCII safety test: 1lI|, 0OD, 8B ║
|
||||
║ ╭─────────╮ ║
|
||||
║ • the euro symbol: │ 14.95 € │ ║
|
||||
║ ╰─────────╯ ║
|
||||
╚══════════════════════════════════════════╝
|
||||
|
||||
Combining characters:
|
||||
|
||||
STARGΛ̊TE SG-1, a = v̇ = r̈, a⃑ ⊥ b⃑
|
||||
|
||||
Greek (in Polytonic):
|
||||
|
||||
The Greek anthem:
|
||||
|
||||
Σὲ γνωρίζω ἀπὸ τὴν κόψη
|
||||
τοῦ σπαθιοῦ τὴν τρομερή,
|
||||
σὲ γνωρίζω ἀπὸ τὴν ὄψη
|
||||
ποὺ μὲ βία μετράει τὴ γῆ.
|
||||
|
||||
᾿Απ᾿ τὰ κόκκαλα βγαλμένη
|
||||
τῶν ῾Ελλήνων τὰ ἱερά
|
||||
καὶ σὰν πρῶτα ἀνδρειωμένη
|
||||
χαῖρε, ὦ χαῖρε, ᾿Ελευθεριά!
|
||||
|
||||
From a speech of Demosthenes in the 4th century BC:
|
||||
|
||||
Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι,
|
||||
ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς
|
||||
λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ
|
||||
τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿
|
||||
εἰς τοῦτο προήκοντα, ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ
|
||||
πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν
|
||||
οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι,
|
||||
οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. ἐγὼ δέ, ὅτι μέν
|
||||
ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον
|
||||
τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι
|
||||
γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν
|
||||
προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους
|
||||
σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ
|
||||
τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ
|
||||
τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς
|
||||
τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον.
|
||||
|
||||
Δημοσθένους, Γ´ ᾿Ολυνθιακὸς
|
||||
|
||||
Georgian:
|
||||
|
||||
From a Unicode conference invitation:
|
||||
|
||||
გთხოვთ ახლავე გაიაროთ რეგისტრაცია Unicode-ის მეათე საერთაშორისო
|
||||
კონფერენციაზე დასასწრებად, რომელიც გაიმართება 10-12 მარტს,
|
||||
ქ. მაინცში, გერმანიაში. კონფერენცია შეჰკრებს ერთად მსოფლიოს
|
||||
ექსპერტებს ისეთ დარგებში როგორიცაა ინტერნეტი და Unicode-ი,
|
||||
ინტერნაციონალიზაცია და ლოკალიზაცია, Unicode-ის გამოყენება
|
||||
ოპერაციულ სისტემებსა, და გამოყენებით პროგრამებში, შრიფტებში,
|
||||
ტექსტების დამუშავებასა და მრავალენოვან კომპიუტერულ სისტემებში.
|
||||
|
||||
Russian:
|
||||
|
||||
From a Unicode conference invitation:
|
||||
|
||||
Зарегистрируйтесь сейчас на Десятую Международную Конференцию по
|
||||
Unicode, которая состоится 10-12 марта 1997 года в Майнце в Германии.
|
||||
Конференция соберет широкий круг экспертов по вопросам глобального
|
||||
Интернета и Unicode, локализации и интернационализации, воплощению и
|
||||
применению Unicode в различных операционных системах и программных
|
||||
приложениях, шрифтах, верстке и многоязычных компьютерных системах.
|
||||
|
||||
Thai (UCS Level 2):
|
||||
|
||||
Excerpt from a poetry on The Romance of The Three Kingdoms (a Chinese
|
||||
classic 'San Gua'):
|
||||
|
||||
[----------------------------|------------------------]
|
||||
๏ แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช พระปกเกศกองบู๊กู้ขึ้นใหม่
|
||||
สิบสองกษัตริย์ก่อนหน้าแลถัดไป สององค์ไซร้โง่เขลาเบาปัญญา
|
||||
ทรงนับถือขันทีเป็นที่พึ่ง บ้านเมืองจึงวิปริตเป็นนักหนา
|
||||
โฮจิ๋นเรียกทัพทั่วหัวเมืองมา หมายจะฆ่ามดชั่วตัวสำคัญ
|
||||
เหมือนขับไสไล่เสือจากเคหา รับหมาป่าเข้ามาเลยอาสัญ
|
||||
ฝ่ายอ้องอุ้นยุแยกให้แตกกัน ใช้สาวนั้นเป็นชนวนชื่นชวนใจ
|
||||
พลันลิฉุยกุยกีกลับก่อเหตุ ช่างอาเพศจริงหนาฟ้าร้องไห้
|
||||
ต้องรบราฆ่าฟันจนบรรลัย ฤๅหาใครค้ำชูกู้บรรลังก์ ฯ
|
||||
|
||||
(The above is a two-column text. If combining characters are handled
|
||||
correctly, the lines of the second column should be aligned with the
|
||||
| character above.)
|
||||
|
||||
Ethiopian:
|
||||
|
||||
Proverbs in the Amharic language:
|
||||
|
||||
ሰማይ አይታረስ ንጉሥ አይከሰስ።
|
||||
ብላ ካለኝ እንደአባቴ በቆመጠኝ።
|
||||
ጌጥ ያለቤቱ ቁምጥና ነው።
|
||||
ደሀ በሕልሙ ቅቤ ባይጠጣ ንጣት በገደለው።
|
||||
የአፍ ወለምታ በቅቤ አይታሽም።
|
||||
አይጥ በበላ ዳዋ ተመታ።
|
||||
ሲተረጉሙ ይደረግሙ።
|
||||
ቀስ በቀስ፥ ዕንቁላል በእግሩ ይሄዳል።
|
||||
ድር ቢያብር አንበሳ ያስር።
|
||||
ሰው እንደቤቱ እንጅ እንደ ጉረቤቱ አይተዳደርም።
|
||||
እግዜር የከፈተውን ጉሮሮ ሳይዘጋው አይድርም።
|
||||
የጎረቤት ሌባ፥ ቢያዩት ይስቅ ባያዩት ያጠልቅ።
|
||||
ሥራ ከመፍታት ልጄን ላፋታት።
|
||||
ዓባይ ማደሪያ የለው፥ ግንድ ይዞ ይዞራል።
|
||||
የእስላም አገሩ መካ የአሞራ አገሩ ዋርካ።
|
||||
ተንጋሎ ቢተፉ ተመልሶ ባፉ።
|
||||
ወዳጅህ ማር ቢሆን ጨርስህ አትላሰው።
|
||||
እግርህን በፍራሽህ ልክ ዘርጋ።
|
||||
|
||||
Runes:
|
||||
|
||||
ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ
|
||||
|
||||
(Old English, which transcribed into Latin reads 'He cwaeth that he
|
||||
bude thaem lande northweardum with tha Westsae.' and means 'He said
|
||||
that he lived in the northern land near the Western Sea.')
|
||||
|
||||
Braille:
|
||||
|
||||
⡌⠁⠧⠑ ⠼⠁⠒ ⡍⠜⠇⠑⠹⠰⠎ ⡣⠕⠌
|
||||
|
||||
⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠙⠑⠁⠙⠒ ⠞⠕ ⠃⠑⠛⠔ ⠺⠊⠹⠲ ⡹⠻⠑ ⠊⠎ ⠝⠕ ⠙⠳⠃⠞
|
||||
⠱⠁⠞⠑⠧⠻ ⠁⠃⠳⠞ ⠹⠁⠞⠲ ⡹⠑ ⠗⠑⠛⠊⠌⠻ ⠕⠋ ⠙⠊⠎ ⠃⠥⠗⠊⠁⠇ ⠺⠁⠎
|
||||
⠎⠊⠛⠝⠫ ⠃⠹ ⠹⠑ ⠊⠇⠻⠛⠹⠍⠁⠝⠂ ⠹⠑ ⠊⠇⠻⠅⠂ ⠹⠑ ⠥⠝⠙⠻⠞⠁⠅⠻⠂
|
||||
⠁⠝⠙ ⠹⠑ ⠡⠊⠑⠋ ⠍⠳⠗⠝⠻⠲ ⡎⠊⠗⠕⠕⠛⠑ ⠎⠊⠛⠝⠫ ⠊⠞⠲ ⡁⠝⠙
|
||||
⡎⠊⠗⠕⠕⠛⠑⠰⠎ ⠝⠁⠍⠑ ⠺⠁⠎ ⠛⠕⠕⠙ ⠥⠏⠕⠝ ⠰⡡⠁⠝⠛⠑⠂ ⠋⠕⠗ ⠁⠝⠹⠹⠔⠛ ⠙⠑
|
||||
⠡⠕⠎⠑ ⠞⠕ ⠏⠥⠞ ⠙⠊⠎ ⠙⠁⠝⠙ ⠞⠕⠲
|
||||
|
||||
⡕⠇⠙ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
|
||||
|
||||
⡍⠔⠙⠖ ⡊ ⠙⠕⠝⠰⠞ ⠍⠑⠁⠝ ⠞⠕ ⠎⠁⠹ ⠹⠁⠞ ⡊ ⠅⠝⠪⠂ ⠕⠋ ⠍⠹
|
||||
⠪⠝ ⠅⠝⠪⠇⠫⠛⠑⠂ ⠱⠁⠞ ⠹⠻⠑ ⠊⠎ ⠏⠜⠞⠊⠊⠥⠇⠜⠇⠹ ⠙⠑⠁⠙ ⠁⠃⠳⠞
|
||||
⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲ ⡊ ⠍⠊⠣⠞ ⠙⠁⠧⠑ ⠃⠑⠲ ⠔⠊⠇⠔⠫⠂ ⠍⠹⠎⠑⠇⠋⠂ ⠞⠕
|
||||
⠗⠑⠛⠜⠙ ⠁ ⠊⠕⠋⠋⠔⠤⠝⠁⠊⠇ ⠁⠎ ⠹⠑ ⠙⠑⠁⠙⠑⠌ ⠏⠊⠑⠊⠑ ⠕⠋ ⠊⠗⠕⠝⠍⠕⠝⠛⠻⠹
|
||||
⠔ ⠹⠑ ⠞⠗⠁⠙⠑⠲ ⡃⠥⠞ ⠹⠑ ⠺⠊⠎⠙⠕⠍ ⠕⠋ ⠳⠗ ⠁⠝⠊⠑⠌⠕⠗⠎
|
||||
⠊⠎ ⠔ ⠹⠑ ⠎⠊⠍⠊⠇⠑⠆ ⠁⠝⠙ ⠍⠹ ⠥⠝⠙⠁⠇⠇⠪⠫ ⠙⠁⠝⠙⠎
|
||||
⠩⠁⠇⠇ ⠝⠕⠞ ⠙⠊⠌⠥⠗⠃ ⠊⠞⠂ ⠕⠗ ⠹⠑ ⡊⠳⠝⠞⠗⠹⠰⠎ ⠙⠕⠝⠑ ⠋⠕⠗⠲ ⡹⠳
|
||||
⠺⠊⠇⠇ ⠹⠻⠑⠋⠕⠗⠑ ⠏⠻⠍⠊⠞ ⠍⠑ ⠞⠕ ⠗⠑⠏⠑⠁⠞⠂ ⠑⠍⠏⠙⠁⠞⠊⠊⠁⠇⠇⠹⠂ ⠹⠁⠞
|
||||
⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
|
||||
|
||||
(The first couple of paragraphs of "A Christmas Carol" by Dickens)
|
||||
|
||||
Compact font selection example text:
|
||||
|
||||
ABCDEFGHIJKLMNOPQRSTUVWXYZ /0123456789
|
||||
abcdefghijklmnopqrstuvwxyz £©µÀÆÖÞßéöÿ
|
||||
–—‘“”„†•…‰™œŠŸž€ ΑΒΓΔΩαβγδω АБВГДабвгд
|
||||
∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ fi<>⑀₂ἠḂӥẄɐː⍎אԱა
|
||||
|
||||
Greetings in various languages:
|
||||
|
||||
Hello world, Καλημέρα κόσμε, コンニチハ
|
||||
|
||||
Box drawing alignment tests: █
|
||||
▉
|
||||
╔══╦══╗ ┌──┬──┐ ╭──┬──╮ ╭──┬──╮ ┏━━┳━━┓ ┎┒┏┑ ╷ ╻ ┏┯┓ ┌┰┐ ▊ ╱╲╱╲╳╳╳
|
||||
║┌─╨─┐║ │╔═╧═╗│ │╒═╪═╕│ │╓─╁─╖│ ┃┌─╂─┐┃ ┗╃╄┙ ╶┼╴╺╋╸┠┼┨ ┝╋┥ ▋ ╲╱╲╱╳╳╳
|
||||
║│╲ ╱│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╿ │┃ ┍╅╆┓ ╵ ╹ ┗┷┛ └┸┘ ▌ ╱╲╱╲╳╳╳
|
||||
╠╡ ╳ ╞╣ ├╢ ╟┤ ├┼─┼─┼┤ ├╫─╂─╫┤ ┣┿╾┼╼┿┫ ┕┛┖┚ ┌┄┄┐ ╎ ┏┅┅┓ ┋ ▍ ╲╱╲╱╳╳╳
|
||||
║│╱ ╲│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╽ │┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▎
|
||||
║└─╥─┘║ │╚═╤═╝│ │╘═╪═╛│ │╙─╀─╜│ ┃└─╂─┘┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▏
|
||||
╚══╩══╝ └──┴──┘ ╰──┴──╯ ╰──┴──╯ ┗━━┻━━┛ ▗▄▖▛▀▜ └╌╌┘ ╎ ┗╍╍┛ ┋ ▁▂▃▄▅▆▇█
|
||||
▝▀▘▙▄▟
|
||||
Loading…
x
Reference in New Issue
Block a user