Attempt to eliminate more uninteresting Unicode characters

This commit is contained in:
Jean-Francois Dockes 2012-10-02 17:45:16 +02:00
parent 4a17bac9e3
commit efd319025d
2 changed files with 88 additions and 193 deletions

View File

@ -62,6 +62,7 @@ static int charclasses[charclasses_size];
// common cases
static unordered_set<unsigned int> unicign;
static unordered_set<unsigned int> visiblewhite;
static vector<unsigned int> vignblocks;
class CharClassInit {
public:
@ -98,6 +99,15 @@ public:
}
unicign.insert((unsigned int)-1);
for (i = 0; i < sizeof(uniignblocks) / sizeof(int); i++) {
vignblocks.push_back(uniignblocks[i]);
}
if (vignblocks.size() % 2) {
LOGFATAL(("Fatal internal error: unicode ign blocks array "
"size not even\n"));
abort();
}
for (i = 0; i < sizeof(avsbwht) / sizeof(int); i++) {
visiblewhite.insert(avsbwht[i]);
}
@ -110,13 +120,35 @@ static inline int whatcc(unsigned int c)
if (c <= 127) {
    // ASCII: the class comes straight from the precomputed table.
    return charclasses[c];
} else {
    // Code points listed individually as ignorable.
    if (unicign.find(c) != unicign.end()) {
        return SPACE;
    } else {
        // vignblocks is a sorted flat list of block bounds stored as
        // (start, end) pairs, both included. lower_bound() yields the
        // first bound which is >= c.
        vector<unsigned int>::iterator it =
            lower_bound(vignblocks.begin(), vignblocks.end(), c);
        // c is greater than every bound: it lies past the last block and
        // the iterator must not be dereferenced.
        if (it == vignblocks.end())
            return LETTER;
        // c is exactly a block start or a block end.
        if (c == *it)
            return SPACE;
        // An odd index means the next bound is a block end, so c is
        // inside that block. An even index means the next bound is a
        // block start, so c sits in a gap between blocks.
        if ((it - vignblocks.begin()) % 2 == 1) {
            return SPACE;
        } else {
            return LETTER;
        }
    }
}
}
// testing whatcc...
#if 0
unsigned int testvalues[] = {'a', '0', 0x80, 0xbf, 0xc0, 0x05c3, 0x1000,
0x2000, 0x2001, 0x206e, 0x206f, 0x20d0, 0x2399,
0x2400, 0x2401, 0x243f, 0x2440, 0xff65};
int ntest = sizeof(testvalues) / sizeof(int);
for (int i = 0; i < ntest; i++) {
int ret = whatcc(testvalues[i]);
printf("Tested value 0x%x, returned value %d %s\n",
testvalues[i], ret, ret == LETTER ? "LETTER" :
ret == SPACE ? "SPACE" : "OTHER");
}
#endif
// CJK Unicode character detection:
//

View File

@ -26,27 +26,61 @@
* the correct way would be to do what http://www.unicode.org/reports/tr29/
* says.
*/
/* Table of code point blocks, stored as a flat list of (first, last) pairs.
 * Both bounds of each pair belong to the block.
 * The values MUST BE SORTED in ascending order. */
static const unsigned uniignblocks[] = {
    0x0080, 0x00BF, /* Start of Latin-1 Supplement block, up to capital A grave */
    0x2000, 0x206F, /* General Punctuation */
    0x2070, 0x209F, /* Superscripts and Subscripts */
    0x20A0, 0x20CF, /* Currency Symbols */
    0x2100, 0x214F, /* Letterlike Symbols */
    0x2150, 0x218F, /* Number Forms */
    0x2190, 0x21FF, /* Arrows */
    0x2200, 0x22FF, /* Mathematical Operators */
    0x2300, 0x23FF, /* Miscellaneous Technical */
    0x2400, 0x243F, /* Control Pictures */
    0x2440, 0x245F, /* Optical Character Recognition */
    0x2460, 0x24FF, /* Enclosed Alphanumerics */
    0x2500, 0x257F, /* Box Drawing */
    0x2580, 0x259F, /* Block Elements */
    0x25A0, 0x25FF, /* Geometric Shapes */
    0x2600, 0x26FF, /* Miscellaneous Symbols */
    0x2700, 0x27BF, /* Dingbats */
    0x27C0, 0x27EF, /* Miscellaneous Mathematical Symbols-A */
    0x27F0, 0x27FF, /* Supplemental Arrows-A */
    0x2900, 0x297F, /* Supplemental Arrows-B */
    0x2980, 0x29FF, /* Miscellaneous Mathematical Symbols-B */
    0x2A00, 0x2AFF, /* Supplemental Mathematical Operators */
    0x2B00, 0x2BFF, /* Miscellaneous Symbols and Arrows */
};
static const unsigned int uniign[] = {
0x0085, /* NEXT LINE NEL;Cc */
0x00A0, /* NO-BREAK SPACE; Zs */
0x00A1, /* INVERTED EXCLAMATION MARK;Po */
0x00A2, /* CENT SIGN;Sc */
0x00A3, /* POUND SIGN;Sc; */
0x00A4, /* CURRENCY SIGN;Sc; */
0x00A5, /* YEN SIGN;Sc; */
0x00A6, /* BROKEN BAR;So */
0x00A7, /* SECTION SIGN;So; */
0x00A9, /* COPYRIGHT SIGN;So */
0x00AB, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK*/
0x00AC, /* NOT SIGN;Sm */
0x00AD, /* SOFT HYPHEN*/
0x00AE, /* registered sign */
0x00B0, /* DEGREE SIGN */
0x00B1, /* PLUS-MINUS SIGN */
0x00B7, /* MIDDLE DOT */
0x00BB, /* RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
0x00BF, /* INVERTED QUESTION MARK; */
0x00D7, /* MULTIPLICATION SIGN */
0x00F7, /* DIVISION SIGN */
0x037E, /* GREEK QUESTION MARK */
0x0387, /* GREEK ANO TELEIA */
0x055C, /* ARMENIAN EXCLAMATION MARK */
@ -67,177 +101,6 @@ static const unsigned int uniign[] = {
0x1806, /* MONGOLIAN TODO SOFT HYPHEN */
0x1809, /* MONGOLIAN MANCHU FULL STOP */
0x180E, /* MONGOLIAN VOWEL SEPARATOR */
0x2000, /* EN QUAD..HAIR SPACE*/
0x2001, /* EN QUAD..HAIR SPACE*/
0x2002, /* EN QUAD..HAIR SPACE*/
0x2003, /* EN QUAD..HAIR SPACE*/
0x2004, /* EN QUAD..HAIR SPACE*/
0x2005, /* EN QUAD..HAIR SPACE*/
0x2006, /* EN QUAD..HAIR SPACE*/
0x2007, /* EN QUAD..HAIR SPACE*/
0x2008, /* EN QUAD..HAIR SPACE*/
0x2009, /* EN QUAD..HAIR SPACE*/
0x200A, /* EN QUAD..HAIR SPACE*/
0x2010, /* [2] HYPHEN..NON-BREAKING HYPHEN*/
0x2011, /* [2] HYPHEN..NON-BREAKING HYPHEN*/
0x2012, /* [6] HYPHEN..HORIZONTAL BAR*/
0x2013, /* [6] HYPHEN..HORIZONTAL BAR*/
0x2014, /* [6] HYPHEN..HORIZONTAL BAR*/
0x2015, /* [6] HYPHEN..HORIZONTAL BAR*/
0x2018, /* LEFT SINGLE QUOTATION MARK*/
0x2019, /* RIGHT SINGLE QUOTATION MARK*/
0x201A, /* SINGLE LOW-9 QUOTATION MARK*/
0x201B, /* SINGLE HIGH-REVERSED-9 QUOTATION MARK*/
0x201C, /* LEFT DOUBLE QUOTATION MARK*/
0x201D, /* RIGHT DOUBLE QUOTATION MARK*/
0x201E, /* DOUBLE LOW-9 QUOTATION MARK*/
0x201F, /* DOUBLE HIGH-REVERSED-9 QUOTATION MARK*/
0x2022, /* BULLET */
0x2023, /* TRIANGULAR BULLET*/
0x2024, /* ONE DOT LEADER;Po;0;ON;<compat> 002E;;;;N;;;;;*/
0x2025, /* TWO DOT LEADER;Po;0;ON;<compat> 002E 002E;;;;N;;;;; */
0x2026, /* HORIZONTAL ELLIPSIS;Po;0;ON;<compat> 002E 002E 002E;;;;N;;;;; */
0x2028, /* LINE SEPARATOR */
0x2029, /* PARAGRAPH SEPARATOR */
0x202F, /* NARROW NO-BREAK SPACE */
0x2032, /* PRIME */
0x2039, /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
0x203A, /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK*/
0x203C, /* [2] DOUBLE EXCLAMATION MARK..INTERROBANG*/
0x203D, /* [2] DOUBLE EXCLAMATION MARK..INTERROBANG*/
0x2047, /* [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
0x2048, /* [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
0x2049, /* [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
0x2053, /* SWUNG DASH*/
0x205F, /* MEDIUM MATHEMATICAL SPACE*/
0x207B, /* SUPERSCRIPT MINUS*/
0x208B, /* SUBSCRIPT MINUS*/
0x20A0, /* EURO-CURRENCY SIGN */
0x20A1, /* COLON SIGN */
0x20A2, /* CRUZEIRO SIGN */
0x20A3, /* FRENCH FRANC SIGN */
0x20A4, /* LIRA SIGN */
0x20A5, /* MILL SIGN */
0x20A6, /* NAIRA SIGN */
0x20A7, /* PESETA SIGN */
0x20A8, /* RUPEE SIGN */
0x20A9, /* WON SIGN */
0x20AA, /* NEW SHEQEL SIGN */
0x20AB, /* DONG SIGN */
0x20AC, /* EURO SIGN */
0x20AD, /* KIP SIGN */
0x20AE, /* TUGRIK SIGN */
0x20AF, /* DRACHMA SIGN */
0x20B0, /* GERMAN PENNY SIGN */
0x20B1, /* PESO SIGN */
0x20B2, /* GUARANI SIGN */
0x20B3, /* AUSTRAL SIGN */
0x20B4, /* HRYVNIA SIGN */
0x20B5, /* CEDI SIGN */
0x20B6, /* LIVRE TOURNOIS SIGN */
0x20B7, /* SPESMILO SIGN */
0x20B8, /* TENGE SIGN */
0x20B9, /* INDIAN RUPEE SIGN */
0x2117, /* SOUND RECORDING COPYRIGHT;So */
0x2122, /* TRADE MARK SIGN;So; */
0x2192, /* RIGHTWARDS ARROW;Sm;0;ON;;;;;N;RIGHT ARROW;;;;*/
0x2212, /* MINUS SIGN*/
0x25A0, /* BLACK SQUARE */
0x25A1, /* WHITE SQUARE */
0x25A2, /* WHITE SQUARE WITH ROUNDED CORNERS */
0x25A3, /* WHITE SQUARE CONTAINING BLACK SMALL SQUARE */
0x25A4, /* SQUARE WITH HORIZONTAL FILL */
0x25A5, /* SQUARE WITH VERTICAL FILL */
0x25A6, /* SQUARE WITH ORTHOGONAL CROSSHATCH FILL */
0x25A7, /* SQUARE WITH UPPER LEFT TO LOWER RIGHT FILL */
0x25A8, /* SQUARE WITH UPPER RIGHT TO LOWER LEFT FILL */
0x25A9, /* SQUARE WITH DIAGONAL CROSSHATCH FILL */
0x25AA, /* BLACK SMALL SQUARE */
0x25AB, /* WHITE SMALL SQUARE */
0x25AC, /* BLACK RECTANGLE */
0x25AD, /* WHITE RECTANGLE */
0x25AE, /* BLACK VERTICAL RECTANGLE */
0x25AF, /* WHITE VERTICAL RECTANGLE */
0x25B0, /* BLACK PARALLELOGRAM */
0x25B1, /* WHITE PARALLELOGRAM */
0x25B2, /* BLACK UP-POINTING TRIANGLE */
0x25B3, /* WHITE UP-POINTING TRIANGLE */
0x25B4, /* BLACK UP-POINTING SMALL TRIANGLE */
0x25B5, /* WHITE UP-POINTING SMALL TRIANGLE */
0x25B6, /* BLACK RIGHT-POINTING TRIANGLE */
0x25B7, /* WHITE RIGHT-POINTING TRIANGLE */
0x25B8, /* BLACK RIGHT-POINTING SMALL TRIANGLE */
0x25B9, /* WHITE RIGHT-POINTING SMALL TRIANGLE */
0x25BA, /* BLACK RIGHT-POINTING POINTER */
0x25BB, /* WHITE RIGHT-POINTING POINTER */
0x25BC, /* BLACK DOWN-POINTING TRIANGLE */
0x25BD, /* WHITE DOWN-POINTING TRIANGLE */
0x25BE, /* BLACK DOWN-POINTING SMALL TRIANGLE */
0x25BF, /* WHITE DOWN-POINTING SMALL TRIANGLE */
0x25C0, /* BLACK LEFT-POINTING TRIANGLE */
0x25C1, /* WHITE LEFT-POINTING TRIANGLE */
0x25C2, /* BLACK LEFT-POINTING SMALL TRIANGLE */
0x25C3, /* WHITE LEFT-POINTING SMALL TRIANGLE */
0x25C4, /* BLACK LEFT-POINTING POINTER */
0x25C5, /* WHITE LEFT-POINTING POINTER */
0x25C6, /* BLACK DIAMOND */
0x25C7, /* WHITE DIAMOND */
0x25C8, /* WHITE DIAMOND CONTAINING BLACK SMALL DIAMOND */
0x25C9, /* FISHEYE */
0x25CA, /* LOZENGE */
0x25CB, /* WHITE CIRCLE */
0x25CC, /* DOTTED CIRCLE */
0x25CD, /* CIRCLE WITH VERTICAL FILL */
0x25CE, /* BULLSEYE */
0x25CF, /* BLACK CIRCLE */
0x25D0, /* CIRCLE WITH LEFT HALF BLACK */
0x25D1, /* CIRCLE WITH RIGHT HALF BLACK */
0x25D2, /* CIRCLE WITH LOWER HALF BLACK */
0x25D3, /* CIRCLE WITH UPPER HALF BLACK */
0x25D4, /* CIRCLE WITH UPPER RIGHT QUADRANT BLACK */
0x25D5, /* CIRCLE WITH ALL BUT UPPER LEFT QUADRANT BLACK */
0x25D6, /* LEFT HALF BLACK CIRCLE */
0x25D7, /* RIGHT HALF BLACK CIRCLE */
0x25D8, /* INVERSE BULLET */
0x25D9, /* INVERSE WHITE CIRCLE */
0x25DA, /* UPPER HALF INVERSE WHITE CIRCLE */
0x25DB, /* LOWER HALF INVERSE WHITE CIRCLE */
0x25DC, /* UPPER LEFT QUADRANT CIRCULAR ARC */
0x25DD, /* UPPER RIGHT QUADRANT CIRCULAR ARC */
0x25DE, /* LOWER RIGHT QUADRANT CIRCULAR ARC */
0x25DF, /* LOWER LEFT QUADRANT CIRCULAR ARC */
0x25E0, /* UPPER HALF CIRCLE */
0x25E1, /* LOWER HALF CIRCLE */
0x25E2, /* BLACK LOWER RIGHT TRIANGLE */
0x25E3, /* BLACK LOWER LEFT TRIANGLE */
0x25E4, /* BLACK UPPER LEFT TRIANGLE */
0x25E5, /* BLACK UPPER RIGHT TRIANGLE */
0x25E6, /* WHITE BULLET */
0x25E7, /* SQUARE WITH LEFT HALF BLACK */
0x25E8, /* SQUARE WITH RIGHT HALF BLACK */
0x25E9, /* SQUARE WITH UPPER LEFT DIAGONAL HALF BLACK */
0x25EA, /* SQUARE WITH LOWER RIGHT DIAGONAL HALF BLACK */
0x25EB, /* WHITE SQUARE WITH VERTICAL BISECTING LINE */
0x25EC, /* WHITE UP-POINTING TRIANGLE WITH DOT */
0x25ED, /* UP-POINTING TRIANGLE WITH LEFT HALF BLACK */
0x25EE, /* UP-POINTING TRIANGLE WITH RIGHT HALF BLACK */
0x25EF, /* LARGE CIRCLE */
0x25F0, /* WHITE SQUARE WITH UPPER LEFT QUADRANT */
0x25F1, /* WHITE SQUARE WITH LOWER LEFT QUADRANT */
0x25F2, /* WHITE SQUARE WITH LOWER RIGHT QUADRANT */
0x25F3, /* WHITE SQUARE WITH UPPER RIGHT QUADRANT */
0x25F4, /* WHITE CIRCLE WITH UPPER LEFT QUADRANT */
0x25F5, /* WHITE CIRCLE WITH LOWER LEFT QUADRANT */
0x25F6, /* WHITE CIRCLE WITH LOWER RIGHT QUADRANT */
0x25F7, /* WHITE CIRCLE WITH UPPER RIGHT QUADRANT */
0x25F8, /* UPPER LEFT TRIANGLE */
0x25F9, /* UPPER RIGHT TRIANGLE */
0x25FA, /* LOWER LEFT TRIANGLE */
0x25FB, /* WHITE MEDIUM SQUARE */
0x25FC, /* BLACK MEDIUM SQUARE */
0x25FD, /* WHITE MEDIUM SMALL SQUARE */
0x25FE, /* BLACK MEDIUM SMALL SQUARE */
0x25FF, /* LOWER RIGHT TRIANGLE */
0x2E2E, /* REVERSED QUESTION MARK;Po;0;ON;;;;;N;;;;; */
0x3000, /* IDEOGRAPHIC SPACE*/
0x3002, /* IDEOGRAPHIC FULL STOP*/