Make unac suppress combining accents found in input. Previously, input in decomposed form was not unaccented.
This commit is contained in:
parent
ea61e85b8f
commit
0d24b5620b
3164
src/unac/unac.c
3164
src/unac/unac.c
File diff suppressed because it is too large
Load Diff
@ -35,7 +35,7 @@ extern "C" {
|
||||
#define UNAC_BLOCK_SHIFT 4
|
||||
#define UNAC_BLOCK_MASK ((1 << UNAC_BLOCK_SHIFT) - 1)
|
||||
#define UNAC_BLOCK_SIZE (1 << UNAC_BLOCK_SHIFT)
|
||||
#define UNAC_BLOCK_COUNT 355
|
||||
#define UNAC_BLOCK_COUNT 418
|
||||
#define UNAC_INDEXES_SIZE (0x10000 >> UNAC_BLOCK_SHIFT)
|
||||
/* Generated by builder. Do not modify. End defines */
|
||||
|
||||
@ -521,6 +521,69 @@ extern unsigned short unac_data351[];
|
||||
extern unsigned short unac_data352[];
|
||||
extern unsigned short unac_data353[];
|
||||
extern unsigned short unac_data354[];
|
||||
extern unsigned short unac_data355[];
|
||||
extern unsigned short unac_data356[];
|
||||
extern unsigned short unac_data357[];
|
||||
extern unsigned short unac_data358[];
|
||||
extern unsigned short unac_data359[];
|
||||
extern unsigned short unac_data360[];
|
||||
extern unsigned short unac_data361[];
|
||||
extern unsigned short unac_data362[];
|
||||
extern unsigned short unac_data363[];
|
||||
extern unsigned short unac_data364[];
|
||||
extern unsigned short unac_data365[];
|
||||
extern unsigned short unac_data366[];
|
||||
extern unsigned short unac_data367[];
|
||||
extern unsigned short unac_data368[];
|
||||
extern unsigned short unac_data369[];
|
||||
extern unsigned short unac_data370[];
|
||||
extern unsigned short unac_data371[];
|
||||
extern unsigned short unac_data372[];
|
||||
extern unsigned short unac_data373[];
|
||||
extern unsigned short unac_data374[];
|
||||
extern unsigned short unac_data375[];
|
||||
extern unsigned short unac_data376[];
|
||||
extern unsigned short unac_data377[];
|
||||
extern unsigned short unac_data378[];
|
||||
extern unsigned short unac_data379[];
|
||||
extern unsigned short unac_data380[];
|
||||
extern unsigned short unac_data381[];
|
||||
extern unsigned short unac_data382[];
|
||||
extern unsigned short unac_data383[];
|
||||
extern unsigned short unac_data384[];
|
||||
extern unsigned short unac_data385[];
|
||||
extern unsigned short unac_data386[];
|
||||
extern unsigned short unac_data387[];
|
||||
extern unsigned short unac_data388[];
|
||||
extern unsigned short unac_data389[];
|
||||
extern unsigned short unac_data390[];
|
||||
extern unsigned short unac_data391[];
|
||||
extern unsigned short unac_data392[];
|
||||
extern unsigned short unac_data393[];
|
||||
extern unsigned short unac_data394[];
|
||||
extern unsigned short unac_data395[];
|
||||
extern unsigned short unac_data396[];
|
||||
extern unsigned short unac_data397[];
|
||||
extern unsigned short unac_data398[];
|
||||
extern unsigned short unac_data399[];
|
||||
extern unsigned short unac_data400[];
|
||||
extern unsigned short unac_data401[];
|
||||
extern unsigned short unac_data402[];
|
||||
extern unsigned short unac_data403[];
|
||||
extern unsigned short unac_data404[];
|
||||
extern unsigned short unac_data405[];
|
||||
extern unsigned short unac_data406[];
|
||||
extern unsigned short unac_data407[];
|
||||
extern unsigned short unac_data408[];
|
||||
extern unsigned short unac_data409[];
|
||||
extern unsigned short unac_data410[];
|
||||
extern unsigned short unac_data411[];
|
||||
extern unsigned short unac_data412[];
|
||||
extern unsigned short unac_data413[];
|
||||
extern unsigned short unac_data414[];
|
||||
extern unsigned short unac_data415[];
|
||||
extern unsigned short unac_data416[];
|
||||
extern unsigned short unac_data417[];
|
||||
/* Generated by builder. Do not modify. End declarations */
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
@ -93,6 +93,14 @@ sub main {
|
||||
}
|
||||
if($general_category =~ /^M/) {
|
||||
$mark{$code_value} = 1;
|
||||
# For mark characters, we generate a 0 entry in the
|
||||
# decomposition table. This signals to the c code that no
|
||||
# output should be generated. Slightly hacky but ok. The
|
||||
# original code let mark characters go through (generating
|
||||
# still accented output if the input was in decomposed
|
||||
# form). Decomposed text is rare, but, for example, macosx file
|
||||
# names have separate combining accent characters.
|
||||
$decomposition{$code_value} = "0000";
|
||||
}
|
||||
$name{$code_value} = $character_name;
|
||||
}
|
||||
@ -114,11 +122,16 @@ sub main {
|
||||
undef @decomposition;
|
||||
last;
|
||||
}
|
||||
if(exists($decomposition{$code_value})) {
|
||||
push(@code_values, split(' ', $decomposition{$code_value}));
|
||||
} elsif (!exists($mark{$code_value})) {
|
||||
push(@decomposition, $code_value);
|
||||
}
|
||||
# marks also have entries in the decomposition table (so that
|
||||
# they can be suppressed when found in input), but no output
|
||||
# component should be generated for them.
|
||||
if (!exists($mark{$code_value})) {
|
||||
if(exists($decomposition{$code_value})) {
|
||||
push(@code_values, split(' ', $decomposition{$code_value}));
|
||||
} else {
|
||||
push(@decomposition, $code_value);
|
||||
}
|
||||
}
|
||||
}
|
||||
if(@decomposition) {
|
||||
$decomposition{$from} = "@decomposition";
|
||||
|
||||
3164
unac/unac.c
3164
unac/unac.c
File diff suppressed because it is too large
Load Diff
65
unac/unac.h
65
unac/unac.h
@ -35,7 +35,7 @@ extern "C" {
|
||||
#define UNAC_BLOCK_SHIFT 4
|
||||
#define UNAC_BLOCK_MASK ((1 << UNAC_BLOCK_SHIFT) - 1)
|
||||
#define UNAC_BLOCK_SIZE (1 << UNAC_BLOCK_SHIFT)
|
||||
#define UNAC_BLOCK_COUNT 355
|
||||
#define UNAC_BLOCK_COUNT 418
|
||||
#define UNAC_INDEXES_SIZE (0x10000 >> UNAC_BLOCK_SHIFT)
|
||||
/* Generated by builder. Do not modify. End defines */
|
||||
|
||||
@ -521,6 +521,69 @@ extern unsigned short unac_data351[];
|
||||
extern unsigned short unac_data352[];
|
||||
extern unsigned short unac_data353[];
|
||||
extern unsigned short unac_data354[];
|
||||
extern unsigned short unac_data355[];
|
||||
extern unsigned short unac_data356[];
|
||||
extern unsigned short unac_data357[];
|
||||
extern unsigned short unac_data358[];
|
||||
extern unsigned short unac_data359[];
|
||||
extern unsigned short unac_data360[];
|
||||
extern unsigned short unac_data361[];
|
||||
extern unsigned short unac_data362[];
|
||||
extern unsigned short unac_data363[];
|
||||
extern unsigned short unac_data364[];
|
||||
extern unsigned short unac_data365[];
|
||||
extern unsigned short unac_data366[];
|
||||
extern unsigned short unac_data367[];
|
||||
extern unsigned short unac_data368[];
|
||||
extern unsigned short unac_data369[];
|
||||
extern unsigned short unac_data370[];
|
||||
extern unsigned short unac_data371[];
|
||||
extern unsigned short unac_data372[];
|
||||
extern unsigned short unac_data373[];
|
||||
extern unsigned short unac_data374[];
|
||||
extern unsigned short unac_data375[];
|
||||
extern unsigned short unac_data376[];
|
||||
extern unsigned short unac_data377[];
|
||||
extern unsigned short unac_data378[];
|
||||
extern unsigned short unac_data379[];
|
||||
extern unsigned short unac_data380[];
|
||||
extern unsigned short unac_data381[];
|
||||
extern unsigned short unac_data382[];
|
||||
extern unsigned short unac_data383[];
|
||||
extern unsigned short unac_data384[];
|
||||
extern unsigned short unac_data385[];
|
||||
extern unsigned short unac_data386[];
|
||||
extern unsigned short unac_data387[];
|
||||
extern unsigned short unac_data388[];
|
||||
extern unsigned short unac_data389[];
|
||||
extern unsigned short unac_data390[];
|
||||
extern unsigned short unac_data391[];
|
||||
extern unsigned short unac_data392[];
|
||||
extern unsigned short unac_data393[];
|
||||
extern unsigned short unac_data394[];
|
||||
extern unsigned short unac_data395[];
|
||||
extern unsigned short unac_data396[];
|
||||
extern unsigned short unac_data397[];
|
||||
extern unsigned short unac_data398[];
|
||||
extern unsigned short unac_data399[];
|
||||
extern unsigned short unac_data400[];
|
||||
extern unsigned short unac_data401[];
|
||||
extern unsigned short unac_data402[];
|
||||
extern unsigned short unac_data403[];
|
||||
extern unsigned short unac_data404[];
|
||||
extern unsigned short unac_data405[];
|
||||
extern unsigned short unac_data406[];
|
||||
extern unsigned short unac_data407[];
|
||||
extern unsigned short unac_data408[];
|
||||
extern unsigned short unac_data409[];
|
||||
extern unsigned short unac_data410[];
|
||||
extern unsigned short unac_data411[];
|
||||
extern unsigned short unac_data412[];
|
||||
extern unsigned short unac_data413[];
|
||||
extern unsigned short unac_data414[];
|
||||
extern unsigned short unac_data415[];
|
||||
extern unsigned short unac_data416[];
|
||||
extern unsigned short unac_data417[];
|
||||
/* Generated by builder. Do not modify. End declarations */
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user