New unac approach for Japanese: don't decompose at all

2009-01-06 18:40:41 +00:00 · 2009-01-06 18:40:41 +00:00 · 0fc81d26b6
commit 0fc81d26b6
parent 1c74414c12
3 changed files with 327 additions and 718 deletions
--- a/unac/builder.in
+++ b/unac/builder.in
@ -72,8 +72,24 @@ sub main {
 	if($character_name =~ /^<(.*), (First|Last)>/) {
 	    $ranges{$1}{$2} = $code_value;
 	}
+
+	# For Japanese kana characters, we don't want to strip accents, as
+	# I'm told that they are essential and that stripping them does
+	# not make sense. Wonder why Unicode defines these decompositions
+	# then...  Problem: the first solution used was to decompose the
+	# accented Japanese kana without removing the accents. But then
+	# the unaccented character would match the string with the
+	# accent. So now we don't decompose kana at all, but this means
+	# that, if the original text was already decomposed, things don't
+	# work as intended: we should actually recombine the
+	# letter+accents in this case for the data to be unified.
 	if($character_decomposition_mapping =~ /(<.*>)?\s*(.+)/) {
-	    $decomposition{$code_value} = $2;
+	    # Not for Hiragana + Katakana (U+3040..U+30FF)
+	    if (!(hex $code_value >= 0x3040 && hex $code_value <= 0x30ff) &&
+		# nor for halfwidth Katakana (U+FF65..U+FF9F)
+		!(hex $code_value >= 0xff65 && hex $code_value <= 0xff9f) ) {
+		$decomposition{$code_value} = $2;
+	    }
 	}
 	if($general_category =~ /^M/) {
 	    $mark{$code_value} = 1;
@ -82,20 +98,16 @@ sub main {
    }
    close(FILE);
    
-    #
    # Generate compatibility decomposition and strip marks
    # (marks == diacritics == accents)
    #
-    # For kana japanese characters, we don't strip accents. Note: we just
-    # need to test for the main kana (hiragana + katakana 3040-30ff) block,
-    # characters such as halfwidth variations will be first decomposed into it
-    #
-    # We also forbid any excursion out of the basic plane. Sorry, Dave.
+    # We also forbid any excursion out of the Basic Multilingual Plane (above U+FFFF).
    my($from, $to);
    while(($from, $to) = each(%decomposition)) {
 	my(@code_values) = split(' ', $to);
 	my($code_value);
 	my(@decomposition);
+
 	while(@code_values) {
 	    my($code_value) = shift(@code_values);
 	    if (hex $code_value > 0xffff) {
@ -104,8 +116,7 @@ sub main {
 	    }
 	    if(exists($decomposition{$code_value})) {
 		push(@code_values, split(' ', $decomposition{$code_value}));
-	    } elsif (!exists($mark{$code_value}) || 
-		     (hex $code_value >= 0x3040 && hex $code_value <= 0x30ff)) {
+	    } elsif (!exists($mark{$code_value})) {
 		push(@decomposition, $code_value);
 	    }
 	}
--- a/unac/unac.c
+++ b/unac/unac.c
--- a/unac/unac.h
+++ b/unac/unac.h
@ -35,7 +35,7 @@ extern "C" {
 #define UNAC_BLOCK_SHIFT 4
 #define UNAC_BLOCK_MASK ((1 << UNAC_BLOCK_SHIFT) - 1)
 #define UNAC_BLOCK_SIZE (1 << UNAC_BLOCK_SHIFT)
-#define UNAC_BLOCK_COUNT 368
+#define UNAC_BLOCK_COUNT 355
 #define UNAC_INDEXES_SIZE (0x10000 >> UNAC_BLOCK_SHIFT)
 /* Generated by builder. Do not modify. End defines */

@ -518,19 +518,6 @@ extern unsigned short unac_data351[];
 extern unsigned short unac_data352[];
 extern unsigned short unac_data353[];
 extern unsigned short unac_data354[];
-extern unsigned short unac_data355[];
-extern unsigned short unac_data356[];
-extern unsigned short unac_data357[];
-extern unsigned short unac_data358[];
-extern unsigned short unac_data359[];
-extern unsigned short unac_data360[];
-extern unsigned short unac_data361[];
-extern unsigned short unac_data362[];
-extern unsigned short unac_data363[];
-extern unsigned short unac_data364[];
-extern unsigned short unac_data365[];
-extern unsigned short unac_data366[];
-extern unsigned short unac_data367[];
 /* Generated by builder. Do not modify. End declarations */

 #ifdef __cplusplus