small fix : remove diaeresis from seps + comments
This commit is contained in:
parent
5d144d6145
commit
3991b11d2b
@ -19,17 +19,15 @@
|
||||
/* @(#$Id: uproplist.h,v 1.3 2008-12-05 11:09:31 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
|
||||
/*
|
||||
* A subset of Unicode chars that we consider whitespace when we split text in
|
||||
* words.
|
||||
|
||||
/**
|
||||
* A subset of Unicode chars that we consider word breaks when we
|
||||
* split text in words.
|
||||
*
|
||||
* This is used as a quick fix to the ASCII-based code, and is not correct.
|
||||
* The correct way would be to do what http://www.unicode.org/reports/tr29/
|
||||
* says. We should then convert first to ucs-4, and then strictly use
|
||||
* character properties, which might actually be simpler than the current
|
||||
* solution...
|
||||
* says.
|
||||
*
|
||||
* From:
|
||||
* Data from:
|
||||
# PropList-4.0.1.txt
|
||||
# Date: 2004-03-02, 02:42:40 GMT [MD]
|
||||
#
|
||||
@ -48,9 +46,7 @@ static const unsigned int uniign[] = {
|
||||
0x00A5, /* YEN SIGN;Sc; */
|
||||
0x00A6, /* BROKEN BAR;So */
|
||||
0x00A7, /* SECTION SIGN;So; */
|
||||
0x00A8, /* DIAERESIS;Sk; */
|
||||
0x00A9, /* COPYRIGHT SIGN;So */
|
||||
0x00AA, /* FEMININE ORDINAL INDICATOR;Ll */
|
||||
0x00AB, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK;Pi */
|
||||
0x00AC, /* NOT SIGN;Sm */
|
||||
0x00AE, /* REGISTERED SIGN;So */
|
||||
@ -160,6 +156,9 @@ static const unsigned int uniign[] = {
|
||||
0xFF1F, /* ; Terminal_Punctuation # Po FULLWIDTH QUESTION MARK*/
|
||||
0xFF61, /* ; Terminal_Punctuation # Po HALFWIDTH IDEOGRAPHIC FULL STOP*/
|
||||
0xFF64, /* ; Terminal_Punctuation # Po HALFWIDTH IDEOGRAPHIC COMMA*/
|
||||
|
||||
/* STerm means Sentence Terminal. Some of these are in Terminal_Punctuation
|
||||
but not all ?? */
|
||||
0x0021, /* ; STerm # Po EXCLAMATION MARK*/
|
||||
0x002E, /* ; STerm # Po FULL STOP*/
|
||||
0x003F, /* ; STerm # Po QUESTION MARK*/
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user