Attempt to eliminate more uninteresting Unicode characters
This commit is contained in:
parent
4a17bac9e3
commit
efd319025d
@ -62,6 +62,7 @@ static int charclasses[charclasses_size];
|
||||
// common cases
|
||||
static unordered_set<unsigned int> unicign;
|
||||
static unordered_set<unsigned int> visiblewhite;
|
||||
static vector<unsigned int> vignblocks;
|
||||
|
||||
class CharClassInit {
|
||||
public:
|
||||
@ -98,6 +99,15 @@ public:
|
||||
}
|
||||
unicign.insert((unsigned int)-1);
|
||||
|
||||
for (i = 0; i < sizeof(uniignblocks) / sizeof(int); i++) {
|
||||
vignblocks.push_back(uniignblocks[i]);
|
||||
}
|
||||
if (vignblocks.size() % 2) {
|
||||
LOGFATAL(("Fatal internal error: unicode ign blocks array "
|
||||
"size not even\n"));
|
||||
abort();
|
||||
}
|
||||
|
||||
for (i = 0; i < sizeof(avsbwht) / sizeof(int); i++) {
|
||||
visiblewhite.insert(avsbwht[i]);
|
||||
}
|
||||
@ -110,13 +120,35 @@ static inline int whatcc(unsigned int c)
|
||||
if (c <= 127) {
|
||||
return charclasses[c];
|
||||
} else {
|
||||
if (unicign.find(c) != unicign.end())
|
||||
if (unicign.find(c) != unicign.end()) {
|
||||
return SPACE;
|
||||
else
|
||||
return LETTER;
|
||||
} else {
|
||||
vector<unsigned int>::iterator it =
|
||||
lower_bound(vignblocks.begin(), vignblocks.end(), c);
|
||||
if (c == *it)
|
||||
return SPACE;
|
||||
if ((it - vignblocks.begin()) % 2 == 1) {
|
||||
return SPACE;
|
||||
} else {
|
||||
return LETTER;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Manual check of whatcc(), compiled out with #if 0. It prints the class
// returned for each sample code point. Several samples sit on or right next
// to boundaries listed in uniignblocks (0x80/0xbf/0xc0, 0x2000/0x206f,
// 0x2400/0x243f/0x2440); 0xff65 is larger than every listed boundary, so it
// covers the case where lower_bound() finds no element >= c.
|
||||
#if 0
|
||||
unsigned int testvalues[] = {'a', '0', 0x80, 0xbf, 0xc0, 0x05c3, 0x1000,
|
||||
0x2000, 0x2001, 0x206e, 0x206f, 0x20d0, 0x2399,
|
||||
0x2400, 0x2401, 0x243f, 0x2440, 0xff65};
|
||||
int ntest = sizeof(testvalues) / sizeof(int);
|
||||
for (int i = 0; i < ntest; i++) {
|
||||
int ret = whatcc(testvalues[i]);
|
||||
printf("Tested value 0x%x, returned value %d %s\n",
|
||||
testvalues[i], ret, ret == LETTER ? "LETTER" :
|
||||
ret == SPACE ? "SPACE" : "OTHER");
|
||||
}
|
||||
#endif
|
||||
|
||||
// CJK Unicode character detection:
|
||||
//
|
||||
|
||||
@ -26,27 +26,61 @@
|
||||
* the correct way would be to do what http://www.unicode.org/reports/tr29/
|
||||
* says.
|
||||
*/
|
||||
|
||||
// Table of Unicode blocks whose code points whatcc() classifies as SPACE.
// It is stored as a flat list of (first, last) code point pairs, both ends
// included. The lookup does a binary search (lower_bound) over the flattened
// values and decides from the parity of the resulting index, so:
//   - the values MUST BE SORTED in ascending order,
//   - the ranges must not overlap,
//   - the number of entries must be even (checked at init time).
static const unsigned uniignblocks[] = {
    // Start of the Latin-1 Supplement block, up to but not including
    // LATIN CAPITAL LETTER A WITH GRAVE (U+00C0)
    0x0080, 0x00BF,
    // General Punctuation
    0x2000, 0x206F,
    // Superscripts and Subscripts
    0x2070, 0x209F,
    // Currency Symbols
    0x20A0, 0x20CF,
    // Letterlike Symbols
    0x2100, 0x214F,
    // Number Forms
    0x2150, 0x218F,
    // Arrows
    0x2190, 0x21FF,
    // Mathematical Operators
    0x2200, 0x22FF,
    // Miscellaneous Technical
    0x2300, 0x23FF,
    // Control Pictures
    0x2400, 0x243F,
    // Optical Character Recognition
    0x2440, 0x245F,
    // Enclosed Alphanumerics
    0x2460, 0x24FF,
    // Box Drawing
    0x2500, 0x257F,
    // Block Elements
    0x2580, 0x259F,
    // Geometric Shapes
    0x25A0, 0x25FF,
    // Miscellaneous Symbols
    0x2600, 0x26FF,
    // Dingbats
    0x2700, 0x27BF,
    // Miscellaneous Mathematical Symbols-A
    0x27C0, 0x27EF,
    // Supplemental Arrows-A
    0x27F0, 0x27FF,
    // Supplemental Arrows-B
    0x2900, 0x297F,
    // Miscellaneous Mathematical Symbols-B
    0x2980, 0x29FF,
    // Supplemental Mathematical Operators
    0x2A00, 0x2AFF,
    // Miscellaneous Symbols and Arrows
    0x2B00, 0x2BFF,
};
|
||||
|
||||
static const unsigned int uniign[] = {
|
||||
0x0085, /* NEXT LINE NEL;Cc */
|
||||
0x00A0, /* NO-BREAK SPACE; Zs */
|
||||
0x00A1, /* INVERTED EXCLAMATION MARK;Po */
|
||||
0x00A2, /* CENT SIGN;Sc */
|
||||
0x00A3, /* POUND SIGN;Sc; */
|
||||
0x00A4, /* CURRENCY SIGN;Sc; */
|
||||
0x00A5, /* YEN SIGN;Sc; */
|
||||
0x00A6, /* BROKEN BAR;So */
|
||||
0x00A7, /* SECTION SIGN;So; */
|
||||
0x00A9, /* COPYRIGHT SIGN;So */
|
||||
0x00AB, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK*/
|
||||
0x00AC, /* NOT SIGN;Sm */
|
||||
0x00AD, /* SOFT HYPHEN*/
|
||||
0x00AE, /* registered sign */
|
||||
0x00B0, /* DEGREE SIGN */
|
||||
0x00B1, /* PLUS-MINUS SIGN */
|
||||
0x00B7, /* MIDDLE DOT */
|
||||
0x00BB, /* RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
|
||||
0x00BF, /* INVERTED QUESTION MARK; */
|
||||
0x00D7, /* MULTIPLICATION SIGN */
|
||||
0x00F7, /* DIVISION SIGN */
|
||||
0x037E, /* GREEK QUESTION MARK */
|
||||
0x0387, /* GREEK ANO TELEIA */
|
||||
0x055C, /* ARMENIAN EXCLAMATION MARK */
|
||||
@ -67,177 +101,6 @@ static const unsigned int uniign[] = {
|
||||
0x1806, /* MONGOLIAN TODO SOFT HYPHEN */
|
||||
0x1809, /* MONGOLIAN MANCHU FULL STOP */
|
||||
0x180E, /* MONGOLIAN VOWEL SEPARATOR */
|
||||
0x2000, /* EN QUAD..HAIR SPACE*/
|
||||
0x2001, /* EN QUAD..HAIR SPACE*/
|
||||
0x2002, /* EN QUAD..HAIR SPACE*/
|
||||
0x2003, /* EN QUAD..HAIR SPACE*/
|
||||
0x2004, /* EN QUAD..HAIR SPACE*/
|
||||
0x2005, /* EN QUAD..HAIR SPACE*/
|
||||
0x2006, /* EN QUAD..HAIR SPACE*/
|
||||
0x2007, /* EN QUAD..HAIR SPACE*/
|
||||
0x2008, /* EN QUAD..HAIR SPACE*/
|
||||
0x2009, /* EN QUAD..HAIR SPACE*/
|
||||
0x200A, /* EN QUAD..HAIR SPACE*/
|
||||
0x2010, /* [2] HYPHEN..NON-BREAKING HYPHEN*/
|
||||
0x2011, /* [2] HYPHEN..NON-BREAKING HYPHEN*/
|
||||
0x2012, /* [6] HYPHEN..HORIZONTAL BAR*/
|
||||
0x2013, /* [6] HYPHEN..HORIZONTAL BAR*/
|
||||
0x2014, /* [6] HYPHEN..HORIZONTAL BAR*/
|
||||
0x2015, /* [6] HYPHEN..HORIZONTAL BAR*/
|
||||
0x2018, /* LEFT SINGLE QUOTATION MARK*/
|
||||
0x2019, /* RIGHT SINGLE QUOTATION MARK*/
|
||||
0x201A, /* SINGLE LOW-9 QUOTATION MARK*/
|
||||
0x201B, /* SINGLE HIGH-REVERSED-9 QUOTATION MARK*/
|
||||
0x201C, /* LEFT DOUBLE QUOTATION MARK*/
|
||||
0x201D, /* RIGHT DOUBLE QUOTATION MARK*/
|
||||
0x201E, /* DOUBLE LOW-9 QUOTATION MARK*/
|
||||
0x201F, /* DOUBLE HIGH-REVERSED-9 QUOTATION MARK*/
|
||||
0x2022, /* BULLET */
|
||||
0x2023, /* TRIANGULAR BULLET*/
|
||||
0x2024, /* ONE DOT LEADER;Po;0;ON;<compat> 002E;;;;N;;;;;*/
|
||||
0x2025, /* TWO DOT LEADER;Po;0;ON;<compat> 002E 002E;;;;N;;;;; */
|
||||
0x2026, /* HORIZONTAL ELLIPSIS;Po;0;ON;<compat> 002E 002E 002E;;;;N;;;;; */
|
||||
0x2028, /* LINE SEPARATOR */
|
||||
0x2029, /* PARAGRAPH SEPARATOR */
|
||||
0x202F, /* NARROW NO-BREAK SPACE */
|
||||
0x2032, /* PRIME */
|
||||
0x2039, /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
|
||||
0x203A, /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK*/
|
||||
0x203C, /* [2] DOUBLE EXCLAMATION MARK..INTERROBANG*/
|
||||
0x203D, /* [2] DOUBLE EXCLAMATION MARK..INTERROBANG*/
|
||||
0x2047, /* [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
|
||||
0x2048, /* [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
|
||||
0x2049, /* [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
|
||||
0x2053, /* SWUNG DASH*/
|
||||
0x205F, /* MEDIUM MATHEMATICAL SPACE*/
|
||||
0x207B, /* SUPERSCRIPT MINUS*/
|
||||
0x208B, /* SUBSCRIPT MINUS*/
|
||||
0x20A0, /* EURO-CURRENCY SIGN */
|
||||
0x20A1, /* COLON SIGN */
|
||||
0x20A2, /* CRUZEIRO SIGN */
|
||||
0x20A3, /* FRENCH FRANC SIGN */
|
||||
0x20A4, /* LIRA SIGN */
|
||||
0x20A5, /* MILL SIGN */
|
||||
0x20A6, /* NAIRA SIGN */
|
||||
0x20A7, /* PESETA SIGN */
|
||||
0x20A8, /* RUPEE SIGN */
|
||||
0x20A9, /* WON SIGN */
|
||||
0x20AA, /* NEW SHEQEL SIGN */
|
||||
0x20AB, /* DONG SIGN */
|
||||
0x20AC, /* EURO SIGN */
|
||||
0x20AD, /* KIP SIGN */
|
||||
0x20AE, /* TUGRIK SIGN */
|
||||
0x20AF, /* DRACHMA SIGN */
|
||||
0x20B0, /* GERMAN PENNY SIGN */
|
||||
0x20B1, /* PESO SIGN */
|
||||
0x20B2, /* GUARANI SIGN */
|
||||
0x20B3, /* AUSTRAL SIGN */
|
||||
0x20B4, /* HRYVNIA SIGN */
|
||||
0x20B5, /* CEDI SIGN */
|
||||
0x20B6, /* LIVRE TOURNOIS SIGN */
|
||||
0x20B7, /* SPESMILO SIGN */
|
||||
0x20B8, /* TENGE SIGN */
|
||||
0x20B9, /* INDIAN RUPEE SIGN */
|
||||
0x2117, /* SOUND RECORDING COPYRIGHT;So */
|
||||
0x2122, /* TRADE MARK SIGN;So; */
|
||||
0x2192, /* RIGHTWARDS ARROW;Sm;0;ON;;;;;N;RIGHT ARROW;;;;*/
|
||||
0x2212, /* MINUS SIGN*/
|
||||
0x25A0, /* BLACK SQUARE */
|
||||
0x25A1, /* WHITE SQUARE */
|
||||
0x25A2, /* WHITE SQUARE WITH ROUNDED CORNERS */
|
||||
0x25A3, /* WHITE SQUARE CONTAINING BLACK SMALL SQUARE */
|
||||
0x25A4, /* SQUARE WITH HORIZONTAL FILL */
|
||||
0x25A5, /* SQUARE WITH VERTICAL FILL */
|
||||
0x25A6, /* SQUARE WITH ORTHOGONAL CROSSHATCH FILL */
|
||||
0x25A7, /* SQUARE WITH UPPER LEFT TO LOWER RIGHT FILL */
|
||||
0x25A8, /* SQUARE WITH UPPER RIGHT TO LOWER LEFT FILL */
|
||||
0x25A9, /* SQUARE WITH DIAGONAL CROSSHATCH FILL */
|
||||
0x25AA, /* BLACK SMALL SQUARE */
|
||||
0x25AB, /* WHITE SMALL SQUARE */
|
||||
0x25AC, /* BLACK RECTANGLE */
|
||||
0x25AD, /* WHITE RECTANGLE */
|
||||
0x25AE, /* BLACK VERTICAL RECTANGLE */
|
||||
0x25AF, /* WHITE VERTICAL RECTANGLE */
|
||||
0x25B0, /* BLACK PARALLELOGRAM */
|
||||
0x25B1, /* WHITE PARALLELOGRAM */
|
||||
0x25B2, /* BLACK UP-POINTING TRIANGLE */
|
||||
0x25B3, /* WHITE UP-POINTING TRIANGLE */
|
||||
0x25B4, /* BLACK UP-POINTING SMALL TRIANGLE */
|
||||
0x25B5, /* WHITE UP-POINTING SMALL TRIANGLE */
|
||||
0x25B6, /* BLACK RIGHT-POINTING TRIANGLE */
|
||||
0x25B7, /* WHITE RIGHT-POINTING TRIANGLE */
|
||||
0x25B8, /* BLACK RIGHT-POINTING SMALL TRIANGLE */
|
||||
0x25B9, /* WHITE RIGHT-POINTING SMALL TRIANGLE */
|
||||
0x25BA, /* BLACK RIGHT-POINTING POINTER */
|
||||
0x25BB, /* WHITE RIGHT-POINTING POINTER */
|
||||
0x25BC, /* BLACK DOWN-POINTING TRIANGLE */
|
||||
0x25BD, /* WHITE DOWN-POINTING TRIANGLE */
|
||||
0x25BE, /* BLACK DOWN-POINTING SMALL TRIANGLE */
|
||||
0x25BF, /* WHITE DOWN-POINTING SMALL TRIANGLE */
|
||||
0x25C0, /* BLACK LEFT-POINTING TRIANGLE */
|
||||
0x25C1, /* WHITE LEFT-POINTING TRIANGLE */
|
||||
0x25C2, /* BLACK LEFT-POINTING SMALL TRIANGLE */
|
||||
0x25C3, /* WHITE LEFT-POINTING SMALL TRIANGLE */
|
||||
0x25C4, /* BLACK LEFT-POINTING POINTER */
|
||||
0x25C5, /* WHITE LEFT-POINTING POINTER */
|
||||
0x25C6, /* BLACK DIAMOND */
|
||||
0x25C7, /* WHITE DIAMOND */
|
||||
0x25C8, /* WHITE DIAMOND CONTAINING BLACK SMALL DIAMOND */
|
||||
0x25C9, /* FISHEYE */
|
||||
0x25CA, /* LOZENGE */
|
||||
0x25CB, /* WHITE CIRCLE */
|
||||
0x25CC, /* DOTTED CIRCLE */
|
||||
0x25CD, /* CIRCLE WITH VERTICAL FILL */
|
||||
0x25CE, /* BULLSEYE */
|
||||
0x25CF, /* BLACK CIRCLE */
|
||||
0x25D0, /* CIRCLE WITH LEFT HALF BLACK */
|
||||
0x25D1, /* CIRCLE WITH RIGHT HALF BLACK */
|
||||
0x25D2, /* CIRCLE WITH LOWER HALF BLACK */
|
||||
0x25D3, /* CIRCLE WITH UPPER HALF BLACK */
|
||||
0x25D4, /* CIRCLE WITH UPPER RIGHT QUADRANT BLACK */
|
||||
0x25D5, /* CIRCLE WITH ALL BUT UPPER LEFT QUADRANT BLACK */
|
||||
0x25D6, /* LEFT HALF BLACK CIRCLE */
|
||||
0x25D7, /* RIGHT HALF BLACK CIRCLE */
|
||||
0x25D8, /* INVERSE BULLET */
|
||||
0x25D9, /* INVERSE WHITE CIRCLE */
|
||||
0x25DA, /* UPPER HALF INVERSE WHITE CIRCLE */
|
||||
0x25DB, /* LOWER HALF INVERSE WHITE CIRCLE */
|
||||
0x25DC, /* UPPER LEFT QUADRANT CIRCULAR ARC */
|
||||
0x25DD, /* UPPER RIGHT QUADRANT CIRCULAR ARC */
|
||||
0x25DE, /* LOWER RIGHT QUADRANT CIRCULAR ARC */
|
||||
0x25DF, /* LOWER LEFT QUADRANT CIRCULAR ARC */
|
||||
0x25E0, /* UPPER HALF CIRCLE */
|
||||
0x25E1, /* LOWER HALF CIRCLE */
|
||||
0x25E2, /* BLACK LOWER RIGHT TRIANGLE */
|
||||
0x25E3, /* BLACK LOWER LEFT TRIANGLE */
|
||||
0x25E4, /* BLACK UPPER LEFT TRIANGLE */
|
||||
0x25E5, /* BLACK UPPER RIGHT TRIANGLE */
|
||||
0x25E6, /* WHITE BULLET */
|
||||
0x25E7, /* SQUARE WITH LEFT HALF BLACK */
|
||||
0x25E8, /* SQUARE WITH RIGHT HALF BLACK */
|
||||
0x25E9, /* SQUARE WITH UPPER LEFT DIAGONAL HALF BLACK */
|
||||
0x25EA, /* SQUARE WITH LOWER RIGHT DIAGONAL HALF BLACK */
|
||||
0x25EB, /* WHITE SQUARE WITH VERTICAL BISECTING LINE */
|
||||
0x25EC, /* WHITE UP-POINTING TRIANGLE WITH DOT */
|
||||
0x25ED, /* UP-POINTING TRIANGLE WITH LEFT HALF BLACK */
|
||||
0x25EE, /* UP-POINTING TRIANGLE WITH RIGHT HALF BLACK */
|
||||
0x25EF, /* LARGE CIRCLE */
|
||||
0x25F0, /* WHITE SQUARE WITH UPPER LEFT QUADRANT */
|
||||
0x25F1, /* WHITE SQUARE WITH LOWER LEFT QUADRANT */
|
||||
0x25F2, /* WHITE SQUARE WITH LOWER RIGHT QUADRANT */
|
||||
0x25F3, /* WHITE SQUARE WITH UPPER RIGHT QUADRANT */
|
||||
0x25F4, /* WHITE CIRCLE WITH UPPER LEFT QUADRANT */
|
||||
0x25F5, /* WHITE CIRCLE WITH LOWER LEFT QUADRANT */
|
||||
0x25F6, /* WHITE CIRCLE WITH LOWER RIGHT QUADRANT */
|
||||
0x25F7, /* WHITE CIRCLE WITH UPPER RIGHT QUADRANT */
|
||||
0x25F8, /* UPPER LEFT TRIANGLE */
|
||||
0x25F9, /* UPPER RIGHT TRIANGLE */
|
||||
0x25FA, /* LOWER LEFT TRIANGLE */
|
||||
0x25FB, /* WHITE MEDIUM SQUARE */
|
||||
0x25FC, /* BLACK MEDIUM SQUARE */
|
||||
0x25FD, /* WHITE MEDIUM SMALL SQUARE */
|
||||
0x25FE, /* BLACK MEDIUM SMALL SQUARE */
|
||||
0x25FF, /* LOWER RIGHT TRIANGLE */
|
||||
0x2E2E, /* REVERSED QUESTION MARK;Po;0;ON;;;;;N;;;;; */
|
||||
0x3000, /* IDEOGRAPHIC SPACE*/
|
||||
0x3002, /* IDEOGRAPHIC FULL STOP*/
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user