From 63d97e597b8c654e634f7a29d25fb35b49607615 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Wed, 19 Sep 2012 19:50:45 +0200 Subject: [PATCH] added a bunch of graphic characters to the word breakers list and changed the container used from set to unordered_set for speed --- src/common/textsplit.cpp | 9 ++-- src/common/uproplist.h | 97 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+), 3 deletions(-) diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index 1fce6c67..764b8cab 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -21,7 +21,10 @@ #include #include -#include <set> +//#include <set> +#include <tr1/unordered_set> +using std::tr1::unordered_set; + #include #include "textsplit.h" @@ -57,8 +60,8 @@ static int charclasses[charclasses_size]; // with interesting properties. This is far from full-blown management // of Unicode properties, but seems to do the job well enough in most // common cases -static set<unsigned int> unicign; -static set<unsigned int> visiblewhite; +static unordered_set<unsigned int> unicign; +static unordered_set<unsigned int> visiblewhite; class CharClassInit { public: diff --git a/src/common/uproplist.h b/src/common/uproplist.h index 071dbe5f..29e99116 100644 --- a/src/common/uproplist.h +++ b/src/common/uproplist.h @@ -162,6 +162,103 @@ static const unsigned int uniign[] = { 0x2122, /* TRADE MARK SIGN;So; */ 0x2192, /* RIGHTWARDS ARROW;Sm;0;ON;;;;;N;RIGHT ARROW;;;;*/ 0x2212, /* ; Dash # Sm MINUS SIGN*/ + /* 25a0 to 25ff are geometric shapes */ + 0x25A0, /* BLACK SQUARE */ + 0x25A1, /* WHITE SQUARE */ + 0x25A2, /* WHITE SQUARE WITH ROUNDED CORNERS */ + 0x25A3, /* WHITE SQUARE CONTAINING BLACK SMALL SQUARE */ + 0x25A4, /* SQUARE WITH HORIZONTAL FILL */ + 0x25A5, /* SQUARE WITH VERTICAL FILL */ + 0x25A6, /* SQUARE WITH ORTHOGONAL CROSSHATCH FILL */ + 0x25A7, /* SQUARE WITH UPPER LEFT TO LOWER RIGHT FILL */ + 0x25A8, /* SQUARE WITH UPPER RIGHT TO LOWER LEFT FILL */ + 0x25A9, /* SQUARE WITH DIAGONAL CROSSHATCH FILL */ + 0x25AA, /* BLACK SMALL SQUARE */ + 
0x25AB, /* WHITE SMALL SQUARE */ + 0x25AC, /* BLACK RECTANGLE */ + 0x25AD, /* WHITE RECTANGLE */ + 0x25AE, /* BLACK VERTICAL RECTANGLE */ + 0x25AF, /* WHITE VERTICAL RECTANGLE */ + 0x25B0, /* BLACK PARALLELOGRAM */ + 0x25B1, /* WHITE PARALLELOGRAM */ + 0x25B2, /* BLACK UP-POINTING TRIANGLE */ + 0x25B3, /* WHITE UP-POINTING TRIANGLE */ + 0x25B4, /* BLACK UP-POINTING SMALL TRIANGLE */ + 0x25B5, /* WHITE UP-POINTING SMALL TRIANGLE */ + 0x25B6, /* BLACK RIGHT-POINTING TRIANGLE */ + 0x25B7, /* WHITE RIGHT-POINTING TRIANGLE */ + 0x25B8, /* BLACK RIGHT-POINTING SMALL TRIANGLE */ + 0x25B9, /* WHITE RIGHT-POINTING SMALL TRIANGLE */ + 0x25BA, /* BLACK RIGHT-POINTING POINTER */ + 0x25BB, /* WHITE RIGHT-POINTING POINTER */ + 0x25BC, /* BLACK DOWN-POINTING TRIANGLE */ + 0x25BD, /* WHITE DOWN-POINTING TRIANGLE */ + 0x25BE, /* BLACK DOWN-POINTING SMALL TRIANGLE */ + 0x25BF, /* WHITE DOWN-POINTING SMALL TRIANGLE */ + 0x25C0, /* BLACK LEFT-POINTING TRIANGLE */ + 0x25C1, /* WHITE LEFT-POINTING TRIANGLE */ + 0x25C2, /* BLACK LEFT-POINTING SMALL TRIANGLE */ + 0x25C3, /* WHITE LEFT-POINTING SMALL TRIANGLE */ + 0x25C4, /* BLACK LEFT-POINTING POINTER */ + 0x25C5, /* WHITE LEFT-POINTING POINTER */ + 0x25C6, /* BLACK DIAMOND */ + 0x25C7, /* WHITE DIAMOND */ + 0x25C8, /* WHITE DIAMOND CONTAINING BLACK SMALL DIAMOND */ + 0x25C9, /* FISHEYE */ + 0x25CA, /* LOZENGE */ + 0x25CB, /* WHITE CIRCLE */ + 0x25CC, /* DOTTED CIRCLE */ + 0x25CD, /* CIRCLE WITH VERTICAL FILL */ + 0x25CE, /* BULLSEYE */ + 0x25CF, /* BLACK CIRCLE */ + 0x25D0, /* CIRCLE WITH LEFT HALF BLACK */ + 0x25D1, /* CIRCLE WITH RIGHT HALF BLACK */ + 0x25D2, /* CIRCLE WITH LOWER HALF BLACK */ + 0x25D3, /* CIRCLE WITH UPPER HALF BLACK */ + 0x25D4, /* CIRCLE WITH UPPER RIGHT QUADRANT BLACK */ + 0x25D5, /* CIRCLE WITH ALL BUT UPPER LEFT QUADRANT BLACK */ + 0x25D6, /* LEFT HALF BLACK CIRCLE */ + 0x25D7, /* RIGHT HALF BLACK CIRCLE */ + 0x25D8, /* INVERSE BULLET */ + 0x25D9, /* INVERSE WHITE CIRCLE */ + 0x25DA, /* UPPER HALF INVERSE WHITE 
CIRCLE */ + 0x25DB, /* LOWER HALF INVERSE WHITE CIRCLE */ + 0x25DC, /* UPPER LEFT QUADRANT CIRCULAR ARC */ + 0x25DD, /* UPPER RIGHT QUADRANT CIRCULAR ARC */ + 0x25DE, /* LOWER RIGHT QUADRANT CIRCULAR ARC */ + 0x25DF, /* LOWER LEFT QUADRANT CIRCULAR ARC */ + 0x25E0, /* UPPER HALF CIRCLE */ + 0x25E1, /* LOWER HALF CIRCLE */ + 0x25E2, /* BLACK LOWER RIGHT TRIANGLE */ + 0x25E3, /* BLACK LOWER LEFT TRIANGLE */ + 0x25E4, /* BLACK UPPER LEFT TRIANGLE */ + 0x25E5, /* BLACK UPPER RIGHT TRIANGLE */ + 0x25E6, /* WHITE BULLET */ + 0x25E7, /* SQUARE WITH LEFT HALF BLACK */ + 0x25E8, /* SQUARE WITH RIGHT HALF BLACK */ + 0x25E9, /* SQUARE WITH UPPER LEFT DIAGONAL HALF BLACK */ + 0x25EA, /* SQUARE WITH LOWER RIGHT DIAGONAL HALF BLACK */ + 0x25EB, /* WHITE SQUARE WITH VERTICAL BISECTING LINE */ + 0x25EC, /* WHITE UP-POINTING TRIANGLE WITH DOT */ + 0x25ED, /* UP-POINTING TRIANGLE WITH LEFT HALF BLACK */ + 0x25EE, /* UP-POINTING TRIANGLE WITH RIGHT HALF BLACK */ + 0x25EF, /* LARGE CIRCLE */ + 0x25F0, /* WHITE SQUARE WITH UPPER LEFT QUADRANT */ + 0x25F1, /* WHITE SQUARE WITH LOWER LEFT QUADRANT */ + 0x25F2, /* WHITE SQUARE WITH LOWER RIGHT QUADRANT */ + 0x25F3, /* WHITE SQUARE WITH UPPER RIGHT QUADRANT */ + 0x25F4, /* WHITE CIRCLE WITH UPPER LEFT QUADRANT */ + 0x25F5, /* WHITE CIRCLE WITH LOWER LEFT QUADRANT */ + 0x25F6, /* WHITE CIRCLE WITH LOWER RIGHT QUADRANT */ + 0x25F7, /* WHITE CIRCLE WITH UPPER RIGHT QUADRANT */ + 0x25F8, /* UPPER LEFT TRIANGLE */ + 0x25F9, /* UPPER RIGHT TRIANGLE */ + 0x25FA, /* LOWER LEFT TRIANGLE */ + 0x25FB, /* WHITE MEDIUM SQUARE */ + 0x25FC, /* BLACK MEDIUM SQUARE */ + 0x25FD, /* WHITE MEDIUM SMALL SQUARE */ + 0x25FE, /* BLACK MEDIUM SMALL SQUARE */ + 0x25FF, /* LOWER RIGHT TRIANGLE */ 0x2E2E, /* REVERSED QUESTION MARK;Po;0;ON;;;;;N;;;;; */ 0x3000, /* ; White_Space # Zs IDEOGRAPHIC SPACE*/ 0x3002, /* ; STerm # Po IDEOGRAPHIC FULL STOP*/