# - DerivedNormalizationProps.txt
# - NormalizationTest.txt
# - UnicodeData.txt
# - StandardizedVariants.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the tables.rs and normalization_tests.rs files into git.
20
21
import collections
21
22
import urllib .request
22
23
57
58
'Cc' : ['C' ], 'Cf' : ['C' ], 'Cs' : ['C' ], 'Co' : ['C' ], 'Cn' : ['C' ],
58
59
}
59
60
61
# Hangul syllable constants from Unicode 9.0.0 Section 3.12,
# "Conjoining Jamo Behavior":
# http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
S_BASE = 0xAC00  # first precomposed Hangul syllable codepoint
L_COUNT = 19     # number of leading consonants (choseong)
V_COUNT = 21     # number of vowels (jungseong)
T_COUNT = 28     # number of trailing consonants (jongseong), incl. "none"
# Total number of precomposed Hangul syllables (11172).
S_COUNT = L_COUNT * V_COUNT * T_COUNT
60
66
class UnicodeData (object ):
61
67
def __init__ (self ):
62
68
self ._load_unicode_data ()
@@ -66,6 +72,9 @@ def __init__(self):
66
72
self .canon_comp = self ._compute_canonical_comp ()
67
73
self .canon_fully_decomp , self .compat_fully_decomp = self ._compute_fully_decomposed ()
68
74
75
+ self .cjk_compat_variants_fully_decomp = {}
76
+ self ._load_cjk_compat_ideograph_variants ()
77
+
69
78
def stats (name , table ):
70
79
count = sum (len (v ) for v in table .values ())
71
80
print ("%s: %d chars => %d decomposed chars" % (name , len (table ), count ))
@@ -75,6 +84,7 @@ def stats(name, table):
75
84
stats ("Compatible decomp" , self .compat_decomp )
76
85
stats ("Canonical fully decomp" , self .canon_fully_decomp )
77
86
stats ("Compatible fully decomp" , self .compat_fully_decomp )
87
+ stats ("CJK Compat Variants fully decomp" , self .cjk_compat_variants_fully_decomp )
78
88
79
89
self .ss_leading , self .ss_trailing = self ._compute_stream_safe_tables ()
80
90
@@ -83,6 +93,7 @@ def _fetch(self, filename):
83
93
return resp .read ().decode ('utf-8' )
84
94
85
95
def _load_unicode_data (self ):
96
+ self .name_to_char_int = {}
86
97
self .combining_classes = {}
87
98
self .compat_decomp = {}
88
99
self .canon_decomp = {}
@@ -95,6 +106,9 @@ def _load_unicode_data(self):
95
106
char , category , cc , decomp = pieces [0 ], pieces [2 ], pieces [3 ], pieces [5 ]
96
107
char_int = int (char , 16 )
97
108
109
+ name = pieces [1 ].strip ()
110
+ self .name_to_char_int [name ] = char_int
111
+
98
112
if cc != '0' :
99
113
self .combining_classes [char_int ] = cc
100
114
@@ -106,6 +120,41 @@ def _load_unicode_data(self):
106
120
if category == 'M' or 'M' in expanded_categories .get (category , []):
107
121
self .general_category_mark .append (char_int )
108
122
123
def _load_cjk_compat_ideograph_variants(self):
    """Populate ``self.cjk_compat_variants_fully_decomp`` from StandardizedVariants.txt.

    Scans the fetched StandardizedVariants.txt for entries whose description
    field names a CJK Compatibility Ideograph, and records a mapping from
    that ideograph's codepoint to its standardized variation sequence
    (a list of codepoints). Asserts that each such sequence is already
    fully normalized with respect to the previously-loaded decomposition
    tables, since downstream code relies on that.
    """
    for line in self._fetch("StandardizedVariants.txt").splitlines():
        # Drop trailing '#' comments; skip blank / comment-only lines.
        strip_comments = line.split('#', 1)[0].strip()
        if not strip_comments:
            continue

        variation_sequence, description, differences = strip_comments.split(';')
        description = description.strip()

        # Don't use variations that only apply in particular shaping environments.
        if differences:
            continue

        # Look for entries where the description field is a codepoint name.
        if description not in self.name_to_char_int:
            continue

        # Only consider the CJK Compatibility Ideographs.
        if not description.startswith('CJK COMPATIBILITY IDEOGRAPH-'):
            continue

        char_int = self.name_to_char_int[description]

        assert char_int not in self.combining_classes, "Unexpected: CJK compat variant with a combining class"
        assert char_int not in self.compat_decomp, "Unexpected: CJK compat variant and compatibility decomposition"
        assert len(self.canon_decomp[char_int]) == 1, "Unexpected: CJK compat variant and non-singleton canonical decomposition"
        # If we ever need to handle Hangul here, we'll need to handle it separately.
        assert not (S_BASE <= char_int < S_BASE + S_COUNT)

        # The variation sequence is a space-separated list of hex codepoints.
        cjk_compat_variant_parts = [int(c, 16) for c in variation_sequence.split()]
        for c in cjk_compat_variant_parts:
            assert c not in self.canon_decomp, "Unexpected: CJK compat variant is unnormalized (canon)"
            assert c not in self.compat_decomp, "Unexpected: CJK compat variant is unnormalized (compat)"
        self.cjk_compat_variants_fully_decomp[char_int] = cjk_compat_variant_parts
109
158
def _load_norm_props (self ):
110
159
props = collections .defaultdict (list )
111
160
@@ -178,11 +227,6 @@ def _compute_fully_decomposed(self):
178
227
The upshot is that decomposition code is very simple and easy to inline
179
228
at mild code size cost.
180
229
"""
181
- # Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
182
- # http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
183
- S_BASE , L_COUNT , V_COUNT , T_COUNT = 0xAC00 , 19 , 21 , 28
184
- S_COUNT = L_COUNT * V_COUNT * T_COUNT
185
-
186
230
def _decompose (char_int , compatible ):
187
231
# 7-bit ASCII never decomposes
188
232
if char_int <= 0x7f :
@@ -320,8 +364,8 @@ def gen_composition_table(canon_comp, out):
320
364
out .write (" }\n " )
321
365
out .write ("}\n " )
322
366
323
- def gen_decomposition_tables (canon_decomp , compat_decomp , out ):
324
- tables = [(canon_decomp , 'canonical' ), (compat_decomp , 'compatibility' )]
367
+ def gen_decomposition_tables (canon_decomp , compat_decomp , cjk_compat_variants_decomp , out ):
368
+ tables = [(canon_decomp , 'canonical' ), (compat_decomp , 'compatibility' ), ( cjk_compat_variants_decomp , 'cjk_compat_variants' ) ]
325
369
for table , name in tables :
326
370
gen_mph_data (name + '_decomposed' , table , "(u32, &'static [char])" ,
327
371
lambda k : "(0x{:x}, &[{}])" .format (k ,
@@ -491,7 +535,7 @@ def minimal_perfect_hash(d):
491
535
gen_composition_table (data .canon_comp , out )
492
536
out .write ("\n " )
493
537
494
- gen_decomposition_tables (data .canon_fully_decomp , data .compat_fully_decomp , out )
538
+ gen_decomposition_tables (data .canon_fully_decomp , data .compat_fully_decomp , data . cjk_compat_variants_fully_decomp , out )
495
539
496
540
gen_combining_mark (data .general_category_mark , out )
497
541
out .write ("\n " )
0 commit comments