From efd319025d8b7301e24ed10246162c6a43578d74 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes
Date: Tue, 2 Oct 2012 17:45:16 +0200
Subject: [PATCH] attempt to eliminate more unicode uninteresting characters

---
 src/common/textsplit.cpp |  47 ++++++-
 src/common/uproplist.h   | 243 +++++++++------------------------------
 2 files changed, 97 insertions(+), 193 deletions(-)

diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp
index 764b8cab..7afd163f 100644
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@@ -62,6 +62,7 @@ static int charclasses[charclasses_size];
 // common cases
 static unordered_set<unsigned int> unicign;
 static unordered_set<unsigned int> visiblewhite;
+static vector<unsigned int> vignblocks;
 
 class CharClassInit {
 public:
@@ -98,6 +99,15 @@ public:
         }
         unicign.insert((unsigned int)-1);
 
+        for (i = 0; i < sizeof(uniignblocks) / sizeof(int); i++) {
+            vignblocks.push_back(uniignblocks[i]);
+        }
+        if (vignblocks.size() % 2) {
+            LOGFATAL(("Fatal internal error: unicode ign blocks array "
+                      "size not even\n"));
+            abort();
+        }
+
         for (i = 0; i < sizeof(avsbwht) / sizeof(int); i++) {
             visiblewhite.insert(avsbwht[i]);
         }
@@ -110,13 +120,44 @@ static inline int whatcc(unsigned int c)
     if (c <= 127) {
         return charclasses[c];
     } else {
-        if (unicign.find(c) != unicign.end())
+        if (unicign.find(c) != unicign.end()) {
             return SPACE;
-        else
-            return LETTER;
+        } else {
+            // vignblocks is a sorted flat list of [start, end] bounds
+            // (both included). Find the first bound which is >= c.
+            vector<unsigned int>::iterator it =
+                lower_bound(vignblocks.begin(), vignblocks.end(), c);
+            // c is beyond the last bound (or the list is empty): it can't
+            // be inside a block, and the iterator must not be dereferenced.
+            if (it == vignblocks.end())
+                return LETTER;
+            // c is exactly a block start or end bound
+            if (c == *it)
+                return SPACE;
+            // Odd index: the bound is a block end, so c is inside the block.
+            // Even index: the bound is the next block start, c is in a gap.
+            if ((it - vignblocks.begin()) % 2 == 1) {
+                return SPACE;
+            } else {
+                return LETTER;
+            }
+        }
     }
 }
 
+// testing whatcc...
+#if 0
+    unsigned int testvalues[] = {'a', '0', 0x80, 0xbf, 0xc0, 0x05c3, 0x1000,
+                                 0x2000, 0x2001, 0x206e, 0x206f, 0x20d0, 0x2399,
+                                 0x2400, 0x2401, 0x243f, 0x2440, 0xff65};
+    int ntest = sizeof(testvalues) / sizeof(int);
+    for (int i = 0; i < ntest; i++) {
+        int ret = whatcc(testvalues[i]);
+        printf("Tested value 0x%x, returned value %d %s\n",
+               testvalues[i], ret, ret == LETTER ? "LETTER" :
+               ret == SPACE ? "SPACE" : "OTHER");
+    }
+#endif
 
 // CJK Unicode character detection:
 //
diff --git a/src/common/uproplist.h b/src/common/uproplist.h
index c8a673ab..016bf922 100644
--- a/src/common/uproplist.h
+++ b/src/common/uproplist.h
@@ -26,27 +26,61 @@
  * the correct way would be to do what http://www.unicode.org/reports/tr29/
  * says.
  */
+
+// Blocks array. Each block is defined by a starting and ending code
+// point (both included). MUST BE SORTED.
+static const unsigned uniignblocks[] = {
+    // Start of latin-1 supplement block, up to capital A grave
+    0x0080, 0x00BF,
+    // General punctuation
+    0x2000, 0x206F,
+    // Superscripts and subscripts
+    0x2070, 0x209F,
+    // Currency symbols
+    0x20A0, 0x20CF,
+    // Letterlike symbols
+    0x2100, 0x214f,
+    // Number forms
+    0x2150, 0x218F,
+    // Arrows
+    0x2190, 0x21FF,
+    // Mathematical Operators
+    0x2200, 0x22FF,
+    // Miscellaneous Technical
+    0x2300, 0x23FF,
+    // Control Pictures
+    0x2400, 0x243F,
+    // Optical Character Recognition
+    0x2440, 0x245F,
+    // Enclosed Alphanumerics
+    0x2460, 0x24FF,
+    // Box Drawing
+    0x2500, 0x257F,
+    // Block Elements
+    0x2580, 0x259F,
+    // Geometric Shapes
+    0x25A0, 0x25FF,
+    // Miscellaneous Symbols
+    0x2600, 0x26FF,
+    // Dingbats
+    0x2700, 0x27BF,
+    // Miscellaneous Mathematical Symbols-A
+    0x27C0, 0x27EF,
+    // Supplemental Arrows-A
+    0x27F0, 0x27FF,
+    // Supplemental Arrows-B
+    0x2900, 0x297F,
+    // Miscellaneous Mathematical Symbols-B
+    0x2980, 0x29FF,
+    // Supplemental Mathematical Operators
+    0x2A00, 0x2AFF,
+    // Miscellaneous Symbols and Arrows
+    0x2B00, 0x2BFF,
+};
+
 static const unsigned int uniign[] = {
-    0x0085, /* NEXT LINE NEL;Cc */
-    0x00A0, /* NO-BREAK SPACE; Zs */
-    0x00A1, /* INVERTED EXCLAMATION MARK;Po */
-    0x00A2, /* CENT SIGN;Sc */
-    0x00A3, /* POUND SIGN;Sc; */
-    0x00A4, /* CURRENCY SIGN;Sc; */
-    0x00A5, /* YEN SIGN;Sc; */
-    0x00A6, /* BROKEN BAR;So */
-    0x00A7, /* SECTION SIGN;So; */
-    0x00A9, /* COPYRIGHT SIGN;So */
-    0x00AB, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK*/
-    0x00AC, /* NOT SIGN;Sm */
-    0x00AD, /* SOFT HYPHEN*/
-    0x00AE, /* registered sign */
-    0x00B0, /* DEGREE SIGN */
-    0x00B1, /* PLUS-MINUS SIGN */
-    0x00B7, /* MIDDLE DOT */
-    0x00BB, /* RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
-    0x00BF, /* INVERTED QUESTION MARK; */
     0x00D7, /* MULTIPLICATION SIGN */
+    0x00F7, /* DIVISION SIGN */
     0x037E, /* GREEK QUESTION MARK */
     0x0387, /* GREEK ANO TELEIA */
     0x055C, /* ARMENIAN EXCLAMATION MARK */
@@ -67,177 +101,6 @@ static const unsigned int uniign[] = {
     0x1806, /* MONGOLIAN TODO SOFT HYPHEN */
     0x1809, /* MONGOLIAN MANCHU FULL STOP */
     0x180E, /* MONGOLIAN VOWEL SEPARATOR */
-    0x2000, /* EN QUAD..HAIR SPACE*/
-    0x2001, /* EN QUAD..HAIR SPACE*/
-    0x2002, /* EN QUAD..HAIR SPACE*/
-    0x2003, /* EN QUAD..HAIR SPACE*/
-    0x2004, /* EN QUAD..HAIR SPACE*/
-    0x2005, /* EN QUAD..HAIR SPACE*/
-    0x2006, /* EN QUAD..HAIR SPACE*/
-    0x2007, /* EN QUAD..HAIR SPACE*/
-    0x2008, /* EN QUAD..HAIR SPACE*/
-    0x2009, /* EN QUAD..HAIR SPACE*/
-    0x200A, /* EN QUAD..HAIR SPACE*/
-    0x2010, /* [2] HYPHEN..NON-BREAKING HYPHEN*/
-    0x2011, /* [2] HYPHEN..NON-BREAKING HYPHEN*/
-    0x2012, /* [6] HYPHEN..HORIZONTAL BAR*/
-    0x2013, /* [6] HYPHEN..HORIZONTAL BAR*/
-    0x2014, /* [6] HYPHEN..HORIZONTAL BAR*/
-    0x2015, /* [6] HYPHEN..HORIZONTAL BAR*/
-    0x2018, /* LEFT SINGLE QUOTATION MARK*/
-    0x2019, /* RIGHT SINGLE QUOTATION MARK*/
-    0x201A, /* SINGLE LOW-9 QUOTATION MARK*/
-    0x201B, /* SINGLE HIGH-REVERSED-9 QUOTATION MARK*/
-    0x201C, /* LEFT DOUBLE QUOTATION MARK*/
-    0x201D, /* RIGHT DOUBLE QUOTATION MARK*/
-    0x201E, /* DOUBLE LOW-9 QUOTATION MARK*/
-    0x201F, /* DOUBLE HIGH-REVERSED-9 QUOTATION MARK*/
-    0x2022, /* BULLET */
-    0x2023, /* TRIANGULAR BULLET*/
-    0x2024, /* ONE DOT LEADER;Po;0;ON;<compat> 002E;;;;N;;;;;*/
-    0x2025, /* TWO DOT LEADER;Po;0;ON;<compat> 002E 002E;;;;N;;;;; */
-    0x2026, /* HORIZONTAL ELLIPSIS;Po;0;ON;<compat> 002E 002E 002E;;;;N;;;;; */
-    0x2028, /* LINE SEPARATOR */
-    0x2029, /* PARAGRAPH SEPARATOR */
-    0x202F, /* NARROW NO-BREAK SPACE */
-    0x2032, /* PRIME */
-    0x2039, /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
-    0x203A, /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK*/
-    0x203C, /* [2] DOUBLE EXCLAMATION MARK..INTERROBANG*/
-    0x203D, /* [2] DOUBLE EXCLAMATION MARK..INTERROBANG*/
-    0x2047, /* [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
-    0x2048, /* [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
-    0x2049, /* [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
-    0x2053, /* SWUNG DASH*/
-    0x205F, /* MEDIUM MATHEMATICAL SPACE*/
-    0x207B, /* SUPERSCRIPT MINUS*/
-    0x208B, /* SUBSCRIPT MINUS*/
-    0x20A0, /* EURO-CURRENCY SIGN */
-    0x20A1, /* COLON SIGN */
-    0x20A2, /* CRUZEIRO SIGN */
-    0x20A3, /* FRENCH FRANC SIGN */
-    0x20A4, /* LIRA SIGN */
-    0x20A5, /* MILL SIGN */
-    0x20A6, /* NAIRA SIGN */
-    0x20A7, /* PESETA SIGN */
-    0x20A8, /* RUPEE SIGN */
-    0x20A9, /* WON SIGN */
-    0x20AA, /* NEW SHEQEL SIGN */
-    0x20AB, /* DONG SIGN */
-    0x20AC, /* EURO SIGN */
-    0x20AD, /* KIP SIGN */
-    0x20AE, /* TUGRIK SIGN */
-    0x20AF, /* DRACHMA SIGN */
-    0x20B0, /* GERMAN PENNY SIGN */
-    0x20B1, /* PESO SIGN */
-    0x20B2, /* GUARANI SIGN */
-    0x20B3, /* AUSTRAL SIGN */
-    0x20B4, /* HRYVNIA SIGN */
-    0x20B5, /* CEDI SIGN */
-    0x20B6, /* LIVRE TOURNOIS SIGN */
-    0x20B7, /* SPESMILO SIGN */
-    0x20B8, /* TENGE SIGN */
-    0x20B9, /* INDIAN RUPEE SIGN */
-    0x2117, /* SOUND RECORDING COPYRIGHT;So */
-    0x2122, /* TRADE MARK SIGN;So; */
-    0x2192, /* RIGHTWARDS ARROW;Sm;0;ON;;;;;N;RIGHT ARROW;;;;*/
-    0x2212, /* MINUS SIGN*/
-    0x25A0, /* BLACK SQUARE */
-    0x25A1, /* WHITE SQUARE */
-    0x25A2, /* WHITE SQUARE WITH ROUNDED CORNERS */
-    0x25A3, /* WHITE SQUARE CONTAINING BLACK SMALL SQUARE */
-    0x25A4, /* SQUARE WITH HORIZONTAL FILL */
-    0x25A5, /* SQUARE WITH VERTICAL FILL */
-    0x25A6, /* SQUARE WITH ORTHOGONAL CROSSHATCH FILL */
-    0x25A7, /* SQUARE WITH UPPER LEFT TO LOWER RIGHT FILL */
-    0x25A8, /* SQUARE WITH UPPER RIGHT TO LOWER LEFT FILL */
-    0x25A9, /* SQUARE WITH DIAGONAL CROSSHATCH FILL */
-    0x25AA, /* BLACK SMALL SQUARE */
-    0x25AB, /* WHITE SMALL SQUARE */
-    0x25AC, /* BLACK RECTANGLE */
-    0x25AD, /* WHITE RECTANGLE */
-    0x25AE, /* BLACK VERTICAL RECTANGLE */
-    0x25AF, /* WHITE VERTICAL RECTANGLE */
-    0x25B0, /* BLACK PARALLELOGRAM */
-    0x25B1, /* WHITE PARALLELOGRAM */
-    0x25B2, /* BLACK UP-POINTING TRIANGLE */
-    0x25B3, /* WHITE UP-POINTING TRIANGLE */
-    0x25B4, /* BLACK UP-POINTING SMALL TRIANGLE */
-    0x25B5, /* WHITE UP-POINTING SMALL TRIANGLE */
-    0x25B6, /* BLACK RIGHT-POINTING TRIANGLE */
-    0x25B7, /* WHITE RIGHT-POINTING TRIANGLE */
-    0x25B8, /* BLACK RIGHT-POINTING SMALL TRIANGLE */
-    0x25B9, /* WHITE RIGHT-POINTING SMALL TRIANGLE */
-    0x25BA, /* BLACK RIGHT-POINTING POINTER */
-    0x25BB, /* WHITE RIGHT-POINTING POINTER */
-    0x25BC, /* BLACK DOWN-POINTING TRIANGLE */
-    0x25BD, /* WHITE DOWN-POINTING TRIANGLE */
-    0x25BE, /* BLACK DOWN-POINTING SMALL TRIANGLE */
-    0x25BF, /* WHITE DOWN-POINTING SMALL TRIANGLE */
-    0x25C0, /* BLACK LEFT-POINTING TRIANGLE */
-    0x25C1, /* WHITE LEFT-POINTING TRIANGLE */
-    0x25C2, /* BLACK LEFT-POINTING SMALL TRIANGLE */
-    0x25C3, /* WHITE LEFT-POINTING SMALL TRIANGLE */
-    0x25C4, /* BLACK LEFT-POINTING POINTER */
-    0x25C5, /* WHITE LEFT-POINTING POINTER */
-    0x25C6, /* BLACK DIAMOND */
-    0x25C7, /* WHITE DIAMOND */
-    0x25C8, /* WHITE DIAMOND CONTAINING BLACK SMALL DIAMOND */
-    0x25C9, /* FISHEYE */
-    0x25CA, /* LOZENGE */
-    0x25CB, /* WHITE CIRCLE */
-    0x25CC, /* DOTTED CIRCLE */
-    0x25CD, /* CIRCLE WITH VERTICAL FILL */
-    0x25CE, /* BULLSEYE */
-    0x25CF, /* BLACK CIRCLE */
-    0x25D0, /* CIRCLE WITH LEFT HALF BLACK */
-    0x25D1, /* CIRCLE WITH RIGHT HALF BLACK */
-    0x25D2, /* CIRCLE WITH LOWER HALF BLACK */
-    0x25D3, /* CIRCLE WITH UPPER HALF BLACK */
-    0x25D4, /* CIRCLE WITH UPPER RIGHT QUADRANT BLACK */
-    0x25D5, /* CIRCLE WITH ALL BUT UPPER LEFT QUADRANT BLACK */
-    0x25D6, /* LEFT HALF BLACK CIRCLE */
-    0x25D7, /* RIGHT HALF BLACK CIRCLE */
-    0x25D8, /* INVERSE BULLET */
-    0x25D9, /* INVERSE WHITE CIRCLE */
-    0x25DA, /* UPPER HALF INVERSE WHITE CIRCLE */
-    0x25DB, /* LOWER HALF INVERSE WHITE CIRCLE */
-    0x25DC, /* UPPER LEFT QUADRANT CIRCULAR ARC */
-    0x25DD, /* UPPER RIGHT QUADRANT CIRCULAR ARC */
-    0x25DE, /* LOWER RIGHT QUADRANT CIRCULAR ARC */
-    0x25DF, /* LOWER LEFT QUADRANT CIRCULAR ARC */
-    0x25E0, /* UPPER HALF CIRCLE */
-    0x25E1, /* LOWER HALF CIRCLE */
-    0x25E2, /* BLACK LOWER RIGHT TRIANGLE */
-    0x25E3, /* BLACK LOWER LEFT TRIANGLE */
-    0x25E4, /* BLACK UPPER LEFT TRIANGLE */
-    0x25E5, /* BLACK UPPER RIGHT TRIANGLE */
-    0x25E6, /* WHITE BULLET */
-    0x25E7, /* SQUARE WITH LEFT HALF BLACK */
-    0x25E8, /* SQUARE WITH RIGHT HALF BLACK */
-    0x25E9, /* SQUARE WITH UPPER LEFT DIAGONAL HALF BLACK */
-    0x25EA, /* SQUARE WITH LOWER RIGHT DIAGONAL HALF BLACK */
-    0x25EB, /* WHITE SQUARE WITH VERTICAL BISECTING LINE */
-    0x25EC, /* WHITE UP-POINTING TRIANGLE WITH DOT */
-    0x25ED, /* UP-POINTING TRIANGLE WITH LEFT HALF BLACK */
-    0x25EE, /* UP-POINTING TRIANGLE WITH RIGHT HALF BLACK */
-    0x25EF, /* LARGE CIRCLE */
-    0x25F0, /* WHITE SQUARE WITH UPPER LEFT QUADRANT */
-    0x25F1, /* WHITE SQUARE WITH LOWER LEFT QUADRANT */
-    0x25F2, /* WHITE SQUARE WITH LOWER RIGHT QUADRANT */
-    0x25F3, /* WHITE SQUARE WITH UPPER RIGHT QUADRANT */
-    0x25F4, /* WHITE CIRCLE WITH UPPER LEFT QUADRANT */
-    0x25F5, /* WHITE CIRCLE WITH LOWER LEFT QUADRANT */
-    0x25F6, /* WHITE CIRCLE WITH LOWER RIGHT QUADRANT */
-    0x25F7, /* WHITE CIRCLE WITH UPPER RIGHT QUADRANT */
-    0x25F8, /* UPPER LEFT TRIANGLE */
-    0x25F9, /* UPPER RIGHT TRIANGLE */
-    0x25FA, /* LOWER LEFT TRIANGLE */
-    0x25FB, /* WHITE MEDIUM SQUARE */
-    0x25FC, /* BLACK MEDIUM SQUARE */
-    0x25FD, /* WHITE MEDIUM SMALL SQUARE */
-    0x25FE, /* BLACK MEDIUM SMALL SQUARE */
-    0x25FF, /* LOWER RIGHT TRIANGLE */
     0x2E2E, /* REVERSED QUESTION MARK;Po;0;ON;;;;;N;;;;; */
     0x3000, /* IDEOGRAPHIC SPACE*/
     0x3002, /* IDEOGRAPHIC FULL STOP*/