Make unac suppress combining accents found in the input. Previously, input in decomposed form was not unaccented.

This commit is contained in:
Jean-Francois Dockes 2011-11-04 21:06:48 +01:00
parent ea61e85b8f
commit 0d24b5620b
5 changed files with 5492 additions and 989 deletions

File diff suppressed because it is too large Load Diff

View File

@ -35,7 +35,7 @@ extern "C" {
#define UNAC_BLOCK_SHIFT 4
#define UNAC_BLOCK_MASK ((1 << UNAC_BLOCK_SHIFT) - 1)
#define UNAC_BLOCK_SIZE (1 << UNAC_BLOCK_SHIFT)
#define UNAC_BLOCK_COUNT 355
#define UNAC_BLOCK_COUNT 418
#define UNAC_INDEXES_SIZE (0x10000 >> UNAC_BLOCK_SHIFT)
/* Generated by builder. Do not modify. End defines */
@ -521,6 +521,69 @@ extern unsigned short unac_data351[];
extern unsigned short unac_data352[];
extern unsigned short unac_data353[];
extern unsigned short unac_data354[];
extern unsigned short unac_data355[];
extern unsigned short unac_data356[];
extern unsigned short unac_data357[];
extern unsigned short unac_data358[];
extern unsigned short unac_data359[];
extern unsigned short unac_data360[];
extern unsigned short unac_data361[];
extern unsigned short unac_data362[];
extern unsigned short unac_data363[];
extern unsigned short unac_data364[];
extern unsigned short unac_data365[];
extern unsigned short unac_data366[];
extern unsigned short unac_data367[];
extern unsigned short unac_data368[];
extern unsigned short unac_data369[];
extern unsigned short unac_data370[];
extern unsigned short unac_data371[];
extern unsigned short unac_data372[];
extern unsigned short unac_data373[];
extern unsigned short unac_data374[];
extern unsigned short unac_data375[];
extern unsigned short unac_data376[];
extern unsigned short unac_data377[];
extern unsigned short unac_data378[];
extern unsigned short unac_data379[];
extern unsigned short unac_data380[];
extern unsigned short unac_data381[];
extern unsigned short unac_data382[];
extern unsigned short unac_data383[];
extern unsigned short unac_data384[];
extern unsigned short unac_data385[];
extern unsigned short unac_data386[];
extern unsigned short unac_data387[];
extern unsigned short unac_data388[];
extern unsigned short unac_data389[];
extern unsigned short unac_data390[];
extern unsigned short unac_data391[];
extern unsigned short unac_data392[];
extern unsigned short unac_data393[];
extern unsigned short unac_data394[];
extern unsigned short unac_data395[];
extern unsigned short unac_data396[];
extern unsigned short unac_data397[];
extern unsigned short unac_data398[];
extern unsigned short unac_data399[];
extern unsigned short unac_data400[];
extern unsigned short unac_data401[];
extern unsigned short unac_data402[];
extern unsigned short unac_data403[];
extern unsigned short unac_data404[];
extern unsigned short unac_data405[];
extern unsigned short unac_data406[];
extern unsigned short unac_data407[];
extern unsigned short unac_data408[];
extern unsigned short unac_data409[];
extern unsigned short unac_data410[];
extern unsigned short unac_data411[];
extern unsigned short unac_data412[];
extern unsigned short unac_data413[];
extern unsigned short unac_data414[];
extern unsigned short unac_data415[];
extern unsigned short unac_data416[];
extern unsigned short unac_data417[];
/* Generated by builder. Do not modify. End declarations */
#ifdef __cplusplus

View File

@ -93,6 +93,14 @@ sub main {
}
if($general_category =~ /^M/) {
$mark{$code_value} = 1;
# For mark characters, we generate a 0 entry in the
# decomposition table. This signals to the C code that no
# output should be generated. Slightly hacky but ok. The
# original code let mark characters go through (generating
# still-accented output if the input was in decomposed
# form). Decomposed text is rare, but, for example, Mac OS X file
# names have separate combining accent characters.
$decomposition{$code_value} = "0000";
}
$name{$code_value} = $character_name;
}
@ -114,11 +122,16 @@ sub main {
undef @decomposition;
last;
}
if(exists($decomposition{$code_value})) {
push(@code_values, split(' ', $decomposition{$code_value}));
} elsif (!exists($mark{$code_value})) {
push(@decomposition, $code_value);
}
# marks also have entries in the decomposition table (so that
# they can be suppressed when found in input), but no output
# component should be generated for them.
if (!exists($mark{$code_value})) {
if(exists($decomposition{$code_value})) {
push(@code_values, split(' ', $decomposition{$code_value}));
} else {
push(@decomposition, $code_value);
}
}
}
if(@decomposition) {
$decomposition{$from} = "@decomposition";

File diff suppressed because it is too large Load Diff

View File

@ -35,7 +35,7 @@ extern "C" {
#define UNAC_BLOCK_SHIFT 4
#define UNAC_BLOCK_MASK ((1 << UNAC_BLOCK_SHIFT) - 1)
#define UNAC_BLOCK_SIZE (1 << UNAC_BLOCK_SHIFT)
#define UNAC_BLOCK_COUNT 355
#define UNAC_BLOCK_COUNT 418
#define UNAC_INDEXES_SIZE (0x10000 >> UNAC_BLOCK_SHIFT)
/* Generated by builder. Do not modify. End defines */
@ -521,6 +521,69 @@ extern unsigned short unac_data351[];
extern unsigned short unac_data352[];
extern unsigned short unac_data353[];
extern unsigned short unac_data354[];
extern unsigned short unac_data355[];
extern unsigned short unac_data356[];
extern unsigned short unac_data357[];
extern unsigned short unac_data358[];
extern unsigned short unac_data359[];
extern unsigned short unac_data360[];
extern unsigned short unac_data361[];
extern unsigned short unac_data362[];
extern unsigned short unac_data363[];
extern unsigned short unac_data364[];
extern unsigned short unac_data365[];
extern unsigned short unac_data366[];
extern unsigned short unac_data367[];
extern unsigned short unac_data368[];
extern unsigned short unac_data369[];
extern unsigned short unac_data370[];
extern unsigned short unac_data371[];
extern unsigned short unac_data372[];
extern unsigned short unac_data373[];
extern unsigned short unac_data374[];
extern unsigned short unac_data375[];
extern unsigned short unac_data376[];
extern unsigned short unac_data377[];
extern unsigned short unac_data378[];
extern unsigned short unac_data379[];
extern unsigned short unac_data380[];
extern unsigned short unac_data381[];
extern unsigned short unac_data382[];
extern unsigned short unac_data383[];
extern unsigned short unac_data384[];
extern unsigned short unac_data385[];
extern unsigned short unac_data386[];
extern unsigned short unac_data387[];
extern unsigned short unac_data388[];
extern unsigned short unac_data389[];
extern unsigned short unac_data390[];
extern unsigned short unac_data391[];
extern unsigned short unac_data392[];
extern unsigned short unac_data393[];
extern unsigned short unac_data394[];
extern unsigned short unac_data395[];
extern unsigned short unac_data396[];
extern unsigned short unac_data397[];
extern unsigned short unac_data398[];
extern unsigned short unac_data399[];
extern unsigned short unac_data400[];
extern unsigned short unac_data401[];
extern unsigned short unac_data402[];
extern unsigned short unac_data403[];
extern unsigned short unac_data404[];
extern unsigned short unac_data405[];
extern unsigned short unac_data406[];
extern unsigned short unac_data407[];
extern unsigned short unac_data408[];
extern unsigned short unac_data409[];
extern unsigned short unac_data410[];
extern unsigned short unac_data411[];
extern unsigned short unac_data412[];
extern unsigned short unac_data413[];
extern unsigned short unac_data414[];
extern unsigned short unac_data415[];
extern unsigned short unac_data416[];
extern unsigned short unac_data417[];
/* Generated by builder. Do not modify. End declarations */
#ifdef __cplusplus