Use Unicode 5.1.0 and don't unaccent katakana/hiragana. The main change in Unicode is that the letters ae and o with stroke no longer decompose into a+e and o+e; we may actually want to restore this if it proves to be a problem.

This commit is contained in:
dockes 2008-12-18 11:04:47 +00:00
parent a4dda86ed9
commit 869d75ee03
5 changed files with 5294 additions and 3189 deletions

View File

@ -82,6 +82,9 @@ sub main {
# Generate compatibility decomposition and strip marks
# (marks == diacritics == accents)
#
# For Japanese kana characters, we don't strip accents. Note: we only
# need to test for the main kana blocks (hiragana + katakana, 0x3040-0x30ff);
# characters such as halfwidth variants are first decomposed into them.
my($from, $to);
while(($from, $to) = each(%decomposition)) {
my(@code_values) = split(' ', $to);
@ -91,7 +94,8 @@ sub main {
my($code_value) = shift(@code_values);
if(exists($decomposition{$code_value})) {
push(@code_values, split(' ', $decomposition{$code_value}));
} elsif(!exists($mark{$code_value})) {
} elsif (!exists($mark{$code_value}) ||
(hex $code_value >= 0x3040 && hex $code_value <= 0x30ff)) {
push(@decomposition, $code_value);
}
}

6586
unac/configure vendored

File diff suppressed because it is too large Load Diff

View File

@ -33,7 +33,7 @@ AC_PROG_MAKE_SET
dnl
dnl Unicode version used by unac, as published at http://www.unicode.org/Public/
dnl
UNICODE_VERSION=3.2.0
UNICODE_VERSION=5.1.0
AC_SUBST(UNICODE_VERSION)
AC_PROG_CC

File diff suppressed because it is too large Load Diff

View File

@ -35,7 +35,7 @@ extern "C" {
#define UNAC_BLOCK_SHIFT 4
#define UNAC_BLOCK_MASK ((1 << UNAC_BLOCK_SHIFT) - 1)
#define UNAC_BLOCK_SIZE (1 << UNAC_BLOCK_SHIFT)
#define UNAC_BLOCK_COUNT 315
#define UNAC_BLOCK_COUNT 368
#define UNAC_INDEXES_SIZE (0x10000 >> UNAC_BLOCK_SHIFT)
/* Generated by builder. Do not modify. End defines */
@ -478,6 +478,59 @@ extern unsigned short unac_data311[];
extern unsigned short unac_data312[];
extern unsigned short unac_data313[];
extern unsigned short unac_data314[];
extern unsigned short unac_data315[];
extern unsigned short unac_data316[];
extern unsigned short unac_data317[];
extern unsigned short unac_data318[];
extern unsigned short unac_data319[];
extern unsigned short unac_data320[];
extern unsigned short unac_data321[];
extern unsigned short unac_data322[];
extern unsigned short unac_data323[];
extern unsigned short unac_data324[];
extern unsigned short unac_data325[];
extern unsigned short unac_data326[];
extern unsigned short unac_data327[];
extern unsigned short unac_data328[];
extern unsigned short unac_data329[];
extern unsigned short unac_data330[];
extern unsigned short unac_data331[];
extern unsigned short unac_data332[];
extern unsigned short unac_data333[];
extern unsigned short unac_data334[];
extern unsigned short unac_data335[];
extern unsigned short unac_data336[];
extern unsigned short unac_data337[];
extern unsigned short unac_data338[];
extern unsigned short unac_data339[];
extern unsigned short unac_data340[];
extern unsigned short unac_data341[];
extern unsigned short unac_data342[];
extern unsigned short unac_data343[];
extern unsigned short unac_data344[];
extern unsigned short unac_data345[];
extern unsigned short unac_data346[];
extern unsigned short unac_data347[];
extern unsigned short unac_data348[];
extern unsigned short unac_data349[];
extern unsigned short unac_data350[];
extern unsigned short unac_data351[];
extern unsigned short unac_data352[];
extern unsigned short unac_data353[];
extern unsigned short unac_data354[];
extern unsigned short unac_data355[];
extern unsigned short unac_data356[];
extern unsigned short unac_data357[];
extern unsigned short unac_data358[];
extern unsigned short unac_data359[];
extern unsigned short unac_data360[];
extern unsigned short unac_data361[];
extern unsigned short unac_data362[];
extern unsigned short unac_data363[];
extern unsigned short unac_data364[];
extern unsigned short unac_data365[];
extern unsigned short unac_data366[];
extern unsigned short unac_data367[];
/* Generated by builder. Do not modify. End declarations */
#ifdef __cplusplus