New unac approach for Japanese: don't decompose at all
This commit is contained in:
parent
1c74414c12
commit
0fc81d26b6
@ -72,8 +72,24 @@ sub main {
|
||||
if($character_name =~ /^<(.*), (First|Last)>/) {
|
||||
$ranges{$1}{$2} = $code_value;
|
||||
}
|
||||
|
||||
# For Japanese kana characters, we don't want to strip accents as I'm
|
||||
# told that they are essential and stripping them does not
|
||||
# make sense. Wonder why Unicode does these decompositions
|
||||
# then... Problem: the first solution used was to decompose
|
||||
# the Japanese accented kana and not remove accents. But then
|
||||
# the unaccented character would match the string with
|
||||
# accent. So now we don't decompose at all, but this means
|
||||
# that, if the original text was decomposed, things don't work
|
||||
# as intended as we should actually recombine the
|
||||
# letter+accents in this case for data to be unified.
|
||||
if($character_decomposition_mapping =~ /(<.*>)?\s*(.+)/) {
|
||||
$decomposition{$code_value} = $2;
|
||||
# Not for Hiragana + Katakana
|
||||
if (!(hex $code_value >= 0x3040 && hex $code_value <= 0x30ff) &&
|
||||
# and Halfwidth katakana
|
||||
!(hex $code_value >= 0xff65 && hex $code_value <= 0xff9f) ) {
|
||||
$decomposition{$code_value} = $2;
|
||||
}
|
||||
}
|
||||
if($general_category =~ /^M/) {
|
||||
$mark{$code_value} = 1;
|
||||
@ -82,20 +98,16 @@ sub main {
|
||||
}
|
||||
close(FILE);
|
||||
|
||||
#
|
||||
# Generate compatibility decomposition and strip marks
|
||||
# (marks == diacritics == accents)
|
||||
#
|
||||
# For kana japanese characters, we don't strip accents. Note: we just
|
||||
# need to test for the main kana (hiragana + katakana 3040-30ff) block,
|
||||
# characters such as halfwidth variations will be first decomposed into it
|
||||
#
|
||||
# We also forbid any excursion out of the basic plane. Sorry, Dave.
|
||||
# We also forbid any excursion out of the basic plane.
|
||||
my($from, $to);
|
||||
while(($from, $to) = each(%decomposition)) {
|
||||
my(@code_values) = split(' ', $to);
|
||||
my($code_value);
|
||||
my(@decomposition);
|
||||
|
||||
while(@code_values) {
|
||||
my($code_value) = shift(@code_values);
|
||||
if (hex $code_value > 0xffff) {
|
||||
@ -104,8 +116,7 @@ sub main {
|
||||
}
|
||||
if(exists($decomposition{$code_value})) {
|
||||
push(@code_values, split(' ', $decomposition{$code_value}));
|
||||
} elsif (!exists($mark{$code_value}) ||
|
||||
(hex $code_value >= 0x3040 && hex $code_value <= 0x30ff)) {
|
||||
} elsif (!exists($mark{$code_value})) {
|
||||
push(@decomposition, $code_value);
|
||||
}
|
||||
}
|
||||
|
||||
1001
unac/unac.c
1001
unac/unac.c
File diff suppressed because it is too large
Load Diff
15
unac/unac.h
15
unac/unac.h
@ -35,7 +35,7 @@ extern "C" {
|
||||
#define UNAC_BLOCK_SHIFT 4
|
||||
#define UNAC_BLOCK_MASK ((1 << UNAC_BLOCK_SHIFT) - 1)
|
||||
#define UNAC_BLOCK_SIZE (1 << UNAC_BLOCK_SHIFT)
|
||||
#define UNAC_BLOCK_COUNT 368
|
||||
#define UNAC_BLOCK_COUNT 355
|
||||
#define UNAC_INDEXES_SIZE (0x10000 >> UNAC_BLOCK_SHIFT)
|
||||
/* Generated by builder. Do not modify. End defines */
|
||||
|
||||
@ -518,19 +518,6 @@ extern unsigned short unac_data351[];
|
||||
extern unsigned short unac_data352[];
|
||||
extern unsigned short unac_data353[];
|
||||
extern unsigned short unac_data354[];
|
||||
extern unsigned short unac_data355[];
|
||||
extern unsigned short unac_data356[];
|
||||
extern unsigned short unac_data357[];
|
||||
extern unsigned short unac_data358[];
|
||||
extern unsigned short unac_data359[];
|
||||
extern unsigned short unac_data360[];
|
||||
extern unsigned short unac_data361[];
|
||||
extern unsigned short unac_data362[];
|
||||
extern unsigned short unac_data363[];
|
||||
extern unsigned short unac_data364[];
|
||||
extern unsigned short unac_data365[];
|
||||
extern unsigned short unac_data366[];
|
||||
extern unsigned short unac_data367[];
|
||||
/* Generated by builder. Do not modify. End declarations */
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user