Use Unicode 5.1.0 and don't unaccent katakana/hiragana. The main change in Unicode is that the letters ae and o with stroke don't decompose anymore into a+e and o+e; we may actually want to restore this if it proves to be a problem.
This commit is contained in:
parent
a4dda86ed9
commit
869d75ee03
@ -82,6 +82,9 @@ sub main {
|
||||
# Generate compatibility decomposition and strip marks
|
||||
# (marks == diacritics == accents)
|
||||
#
|
||||
# For Japanese kana characters, we don't strip accents. Note: we just
|
||||
# need to test for the main kana (hiragana + katakana 3040-30ff) block,
|
||||
# as characters such as halfwidth variants will first be decomposed into it.
|
||||
my($from, $to);
|
||||
while(($from, $to) = each(%decomposition)) {
|
||||
my(@code_values) = split(' ', $to);
|
||||
@ -91,7 +94,8 @@ sub main {
|
||||
my($code_value) = shift(@code_values);
|
||||
if(exists($decomposition{$code_value})) {
|
||||
push(@code_values, split(' ', $decomposition{$code_value}));
|
||||
} elsif(!exists($mark{$code_value})) {
|
||||
} elsif (!exists($mark{$code_value}) ||
|
||||
(hex $code_value >= 0x3040 && hex $code_value <= 0x30ff)) {
|
||||
push(@decomposition, $code_value);
|
||||
}
|
||||
}
|
||||
|
||||
6586
unac/configure
vendored
6586
unac/configure
vendored
File diff suppressed because it is too large
Load Diff
@ -33,7 +33,7 @@ AC_PROG_MAKE_SET
|
||||
dnl
|
||||
dnl Unicode version used by unac, as published at http://www.unicode.org/Public/
|
||||
dnl
|
||||
UNICODE_VERSION=3.2.0
|
||||
UNICODE_VERSION=5.1.0
|
||||
AC_SUBST(UNICODE_VERSION)
|
||||
|
||||
AC_PROG_CC
|
||||
|
||||
1834
unac/unac.c
1834
unac/unac.c
File diff suppressed because it is too large
Load Diff
55
unac/unac.h
55
unac/unac.h
@ -35,7 +35,7 @@ extern "C" {
|
||||
#define UNAC_BLOCK_SHIFT 4
|
||||
#define UNAC_BLOCK_MASK ((1 << UNAC_BLOCK_SHIFT) - 1)
|
||||
#define UNAC_BLOCK_SIZE (1 << UNAC_BLOCK_SHIFT)
|
||||
#define UNAC_BLOCK_COUNT 315
|
||||
#define UNAC_BLOCK_COUNT 368
|
||||
#define UNAC_INDEXES_SIZE (0x10000 >> UNAC_BLOCK_SHIFT)
|
||||
/* Generated by builder. Do not modify. End defines */
|
||||
|
||||
@ -478,6 +478,59 @@ extern unsigned short unac_data311[];
|
||||
extern unsigned short unac_data312[];
|
||||
extern unsigned short unac_data313[];
|
||||
extern unsigned short unac_data314[];
|
||||
extern unsigned short unac_data315[];
|
||||
extern unsigned short unac_data316[];
|
||||
extern unsigned short unac_data317[];
|
||||
extern unsigned short unac_data318[];
|
||||
extern unsigned short unac_data319[];
|
||||
extern unsigned short unac_data320[];
|
||||
extern unsigned short unac_data321[];
|
||||
extern unsigned short unac_data322[];
|
||||
extern unsigned short unac_data323[];
|
||||
extern unsigned short unac_data324[];
|
||||
extern unsigned short unac_data325[];
|
||||
extern unsigned short unac_data326[];
|
||||
extern unsigned short unac_data327[];
|
||||
extern unsigned short unac_data328[];
|
||||
extern unsigned short unac_data329[];
|
||||
extern unsigned short unac_data330[];
|
||||
extern unsigned short unac_data331[];
|
||||
extern unsigned short unac_data332[];
|
||||
extern unsigned short unac_data333[];
|
||||
extern unsigned short unac_data334[];
|
||||
extern unsigned short unac_data335[];
|
||||
extern unsigned short unac_data336[];
|
||||
extern unsigned short unac_data337[];
|
||||
extern unsigned short unac_data338[];
|
||||
extern unsigned short unac_data339[];
|
||||
extern unsigned short unac_data340[];
|
||||
extern unsigned short unac_data341[];
|
||||
extern unsigned short unac_data342[];
|
||||
extern unsigned short unac_data343[];
|
||||
extern unsigned short unac_data344[];
|
||||
extern unsigned short unac_data345[];
|
||||
extern unsigned short unac_data346[];
|
||||
extern unsigned short unac_data347[];
|
||||
extern unsigned short unac_data348[];
|
||||
extern unsigned short unac_data349[];
|
||||
extern unsigned short unac_data350[];
|
||||
extern unsigned short unac_data351[];
|
||||
extern unsigned short unac_data352[];
|
||||
extern unsigned short unac_data353[];
|
||||
extern unsigned short unac_data354[];
|
||||
extern unsigned short unac_data355[];
|
||||
extern unsigned short unac_data356[];
|
||||
extern unsigned short unac_data357[];
|
||||
extern unsigned short unac_data358[];
|
||||
extern unsigned short unac_data359[];
|
||||
extern unsigned short unac_data360[];
|
||||
extern unsigned short unac_data361[];
|
||||
extern unsigned short unac_data362[];
|
||||
extern unsigned short unac_data363[];
|
||||
extern unsigned short unac_data364[];
|
||||
extern unsigned short unac_data365[];
|
||||
extern unsigned short unac_data366[];
|
||||
extern unsigned short unac_data367[];
|
||||
/* Generated by builder. Do not modify. End declarations */
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user