New unac approach for Japanese: don't decompose at all

This commit is contained in:
dockes 2009-01-06 18:40:41 +00:00
parent 1c74414c12
commit 0fc81d26b6
3 changed files with 327 additions and 718 deletions

View File

@ -72,8 +72,24 @@ sub main {
if($character_name =~ /^<(.*), (First|Last)>/) { if($character_name =~ /^<(.*), (First|Last)>/) {
$ranges{$1}{$2} = $code_value; $ranges{$1}{$2} = $code_value;
} }
# For Japanese kana characters, we don't want to strip accents, as I'm
# told that they are essential and stripping them does not
# make sense. Wonder why Unicode does these decompositions
# then... Problem: the first solution used was to decompose
# the accented Japanese kana and not remove accents. But then
# the unaccented character would match the string with
# accent. So now we don't decompose at all, but this means
# that, if the original text was decomposed, things don't work
# as intended: we should actually recombine the
# letter+accents in this case for the data to be unified.
if($character_decomposition_mapping =~ /(<.*>)?\s*(.+)/) { if($character_decomposition_mapping =~ /(<.*>)?\s*(.+)/) {
$decomposition{$code_value} = $2; # Not for Hiragana + Katakana
if (!(hex $code_value >= 0x3040 && hex $code_value <= 0x30ff) &&
# and Halfwidth katakana
!(hex $code_value >= 0xff65 && hex $code_value <= 0xff9f) ) {
$decomposition{$code_value} = $2;
}
} }
if($general_category =~ /^M/) { if($general_category =~ /^M/) {
$mark{$code_value} = 1; $mark{$code_value} = 1;
@ -82,20 +98,16 @@ sub main {
} }
close(FILE); close(FILE);
#
# Generate compatibility decomposition and strip marks # Generate compatibility decomposition and strip marks
# (marks == diacritics == accents) # (marks == diacritics == accents)
# #
# For kana japanese characters, we don't strip accents. Note: we just # We also forbid any excursion out of the basic plane.
# need to test for the main kana (hiragana + katakana 3040-30ff) block,
# characters such as halfwidth variations will be first decomposed into it
#
# We also forbid any excursion out of the basic plane. Sorry, Dave.
my($from, $to); my($from, $to);
while(($from, $to) = each(%decomposition)) { while(($from, $to) = each(%decomposition)) {
my(@code_values) = split(' ', $to); my(@code_values) = split(' ', $to);
my($code_value); my($code_value);
my(@decomposition); my(@decomposition);
while(@code_values) { while(@code_values) {
my($code_value) = shift(@code_values); my($code_value) = shift(@code_values);
if (hex $code_value > 0xffff) { if (hex $code_value > 0xffff) {
@ -104,8 +116,7 @@ sub main {
} }
if(exists($decomposition{$code_value})) { if(exists($decomposition{$code_value})) {
push(@code_values, split(' ', $decomposition{$code_value})); push(@code_values, split(' ', $decomposition{$code_value}));
} elsif (!exists($mark{$code_value}) || } elsif (!exists($mark{$code_value})) {
(hex $code_value >= 0x3040 && hex $code_value <= 0x30ff)) {
push(@decomposition, $code_value); push(@decomposition, $code_value);
} }
} }

File diff suppressed because it is too large Load Diff

View File

@ -35,7 +35,7 @@ extern "C" {
#define UNAC_BLOCK_SHIFT 4 #define UNAC_BLOCK_SHIFT 4
#define UNAC_BLOCK_MASK ((1 << UNAC_BLOCK_SHIFT) - 1) #define UNAC_BLOCK_MASK ((1 << UNAC_BLOCK_SHIFT) - 1)
#define UNAC_BLOCK_SIZE (1 << UNAC_BLOCK_SHIFT) #define UNAC_BLOCK_SIZE (1 << UNAC_BLOCK_SHIFT)
#define UNAC_BLOCK_COUNT 368 #define UNAC_BLOCK_COUNT 355
#define UNAC_INDEXES_SIZE (0x10000 >> UNAC_BLOCK_SHIFT) #define UNAC_INDEXES_SIZE (0x10000 >> UNAC_BLOCK_SHIFT)
/* Generated by builder. Do not modify. End defines */ /* Generated by builder. Do not modify. End defines */
@ -518,19 +518,6 @@ extern unsigned short unac_data351[];
extern unsigned short unac_data352[]; extern unsigned short unac_data352[];
extern unsigned short unac_data353[]; extern unsigned short unac_data353[];
extern unsigned short unac_data354[]; extern unsigned short unac_data354[];
extern unsigned short unac_data355[];
extern unsigned short unac_data356[];
extern unsigned short unac_data357[];
extern unsigned short unac_data358[];
extern unsigned short unac_data359[];
extern unsigned short unac_data360[];
extern unsigned short unac_data361[];
extern unsigned short unac_data362[];
extern unsigned short unac_data363[];
extern unsigned short unac_data364[];
extern unsigned short unac_data365[];
extern unsigned short unac_data366[];
extern unsigned short unac_data367[];
/* Generated by builder. Do not modify. End declarations */ /* Generated by builder. Do not modify. End declarations */
#ifdef __cplusplus #ifdef __cplusplus