@@ -80,6 +80,60 @@ def load_emoji_properties(f):
80
80
81
81
return kinds
82
82
83
+
84
def load_general_category_properties(f):
    """Parse per-code-point categories out of a UnicodeData-style file.

    `fetch_unidata(f)` is called first (defined elsewhere in this file;
    presumably it makes sure the data file is available locally -- confirm
    against its definition).  The file is then read from the current working
    directory under the name `os.path.basename(f)`, decoded as UTF-8.

    Each line of the form `HEX;NAME;CATEGORY;...` yields one entry; lines that
    do not match that shape are skipped.  Three kinds of NAME are handled:

    * an ordinary name, or any other `<...>` name (e.g. `<control>`):
      a single code point;
    * `<X, First>`: opens a range whose members are not listed individually;
    * `<X, Last>`: closes the range opened by the preceding `<X, First>`.

    Returns:
        A list of `(lo, hi, category)` tuples in file order, where `lo` and
        `hi` are inclusive integer code points and `category` is the third
        field of the line as a string.

    Raises:
        AssertionError: a `<X, Last>` line does not agree in name or category
            with the most recent `<X, First>` line.
        ValueError: a name starts with `<` but matches none of the known
            bracketed forms.
    """
    fetch_unidata(f)

    entry_re = re.compile(r"^([0-9A-F]+);([^;]+);([A-Za-z]+);.*$")
    range_first_re = re.compile(r"^<(.*), First>$")
    range_last_re = re.compile(r"^<(.*), Last>$")
    bracketed_re = re.compile(r"^<(.*)>$")

    general_category_list = []

    # State carried from a "<X, First>" line to its matching "<X, Last>" line.
    special_group_lo = 0
    special_group_text = ''
    special_group_gc = ''

    lines = fileinput.input(os.path.basename(f), openhook=fileinput.hook_encoded("utf-8"))
    try:
        for line in lines:
            m = entry_re.match(line)
            if not m:
                continue

            code_point = int(m.group(1), 16)
            d_name = m.group(2).strip()
            d_gc = m.group(3).strip()

            # Ordinary names can never match the bracketed patterns below, so
            # they are dispatched first.
            if not d_name.startswith('<'):
                general_category_list.append((code_point, code_point, d_gc))
                continue

            m_first = range_first_re.match(d_name)
            if m_first:
                special_group_lo = code_point
                special_group_text = m_first.group(1)
                special_group_gc = d_gc
                continue

            m_last = range_last_re.match(d_name)
            if m_last:
                assert special_group_text == m_last.group(1), \
                    "range end %r at U+%04X does not match range start %r" % (
                        m_last.group(1), code_point, special_group_text)
                assert special_group_gc == d_gc, \
                    "range ending at U+%04X changes category from %r to %r" % (
                        code_point, special_group_gc, d_gc)
                general_category_list.append((special_group_lo, code_point, d_gc))
                continue

            # Any other "<...>" label describes a single code point.
            if bracketed_re.match(d_name):
                general_category_list.append((code_point, code_point, d_gc))
                continue

            raise ValueError("unreachable")
    finally:
        # Always release the fileinput stream: if parsing aborts half-way the
        # module-level fileinput state would otherwise stay active and make
        # the next fileinput.input() call fail.
        lines.close()

    return general_category_list
136
+
83
137
def format_table_content (f , content , indent ):
84
138
line = " " * indent
85
139
first = True
@@ -130,13 +184,200 @@ def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
130
184
format_table_content (f , data , 8 )
131
185
f .write ("\n ];\n \n " )
132
186
187
def emit_general_category_module(f):
    """Write the feature-gated Rust `general_category` module to `f`.

    The output consists of, in order: the `GeneralCategory` and
    `GeneralCategoryGroup` enums, three `pub(crate)` helper functions, and a
    private `GENERAL_CATEGORY` range table built from "UnicodeData.txt" via
    `load_general_category_properties`.

    `f` only needs a `write` method here; it is also handed on to
    `emit_table`.
    """
    f.write("""#[cfg(feature = \"general-category\")]
pub mod general_category {""")
    f.write("""

    #[derive(Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd, Debug)]
    pub enum GeneralCategory {
        /// an uppercase letter
        LetterUppercase,
        /// a lowercase letter
        LetterLowercase,
        /// a digraphic character, with first part uppercase
        LetterTitlecase,
        /// a modifier letter
        LetterModifier,
        /// other letters, including syllables and ideographs
        LetterOther,
        /// a nonspacing combining mark (zero advance width)
        MarkNonspacing,
        /// a spacing combining mark (positive advance width)
        MarkSpacing,
        /// an enclosing combining mark
        MarkEnclosing,
        /// a decimal digit
        NumberDecimal,
        /// a letterlike numeric character
        NumberLetter,
        /// a numeric character of other type
        NumberOther,
        /// a connecting punctuation mark, like a tie
        PunctuationConnector,
        /// a dash or hyphen punctuation mark
        PunctuationDash,
        /// an opening punctuation mark (of a pair)
        PunctuationOpen,
        /// a closing punctuation mark (of a pair)
        PunctuationClose,
        /// an initial quotation mark
        PunctuationInitial,
        /// a final quotation mark
        PunctuationFinal,
        /// a punctuation mark of other type
        PunctuationOther,
        /// a symbol of mathematical use
        SymbolMath,
        /// a currency sign
        SymbolCurrency,
        /// a non-letterlike modifier symbol
        SymbolModifier,
        /// a symbol of other type
        SymbolOther,
        /// a space character (of various non-zero widths)
        SeparatorSpace,
        /// U+2028 LINE SEPARATOR only
        SeparatorLine,
        /// U+2029 PARAGRAPH SEPARATOR only
        SeparatorParagraph,
        /// a C0 or C1 control code
        OtherControl,
        /// a format control character
        OtherFormat,
        /// a surrogate code point
        OtherSurrogate,
        /// a private-use character
        OtherPrivateUse,
        /// a reserved unassigned code point or a noncharacter
        OtherUnassigned,
    }

    #[derive(Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd, Debug)]
    pub enum GeneralCategoryGroup {
        /// Lu | Ll | Lt | Lm | Lo
        Letter,
        /// Mn | Mc | Me
        Mark,
        /// Nd | Nl | No
        Number,
        /// Pc | Pd | Ps | Pe | Pi | Pf | Po
        Punctuation,
        /// Sm | Sc | Sk | So
        Symbol,
        /// Zs | Zl | Zp
        Separator,
        /// Cc | Cf | Cs | Co | Cn
        Other,
    }

    #[inline]
    pub(crate) fn general_category_of_char(c: char) -> GeneralCategory {
        match c as usize {
            _ => super::util::bsearch_range_value_table(c, GENERAL_CATEGORY).unwrap_or(GeneralCategory::OtherUnassigned)
        }
    }

    #[inline]
    pub(crate) fn general_category_is_letter_cased(gc: GeneralCategory) -> bool {
        matches!(gc, GeneralCategory::LetterUppercase | GeneralCategory::LetterLowercase | GeneralCategory::LetterTitlecase)
    }

    #[inline]
    pub(crate) fn general_category_group(gc: GeneralCategory) -> GeneralCategoryGroup {
        match gc {
            GeneralCategory::LetterUppercase |
            GeneralCategory::LetterLowercase |
            GeneralCategory::LetterTitlecase |
            GeneralCategory::LetterModifier |
            GeneralCategory::LetterOther => GeneralCategoryGroup::Letter,
            GeneralCategory::MarkNonspacing |
            GeneralCategory::MarkSpacing |
            GeneralCategory::MarkEnclosing => GeneralCategoryGroup::Mark,
            GeneralCategory::NumberDecimal |
            GeneralCategory::NumberLetter |
            GeneralCategory::NumberOther => GeneralCategoryGroup::Number,
            GeneralCategory::PunctuationConnector |
            GeneralCategory::PunctuationDash |
            GeneralCategory::PunctuationOpen |
            GeneralCategory::PunctuationClose |
            GeneralCategory::PunctuationInitial |
            GeneralCategory::PunctuationFinal |
            GeneralCategory::PunctuationOther => GeneralCategoryGroup::Punctuation,
            GeneralCategory::SymbolMath |
            GeneralCategory::SymbolCurrency |
            GeneralCategory::SymbolModifier |
            GeneralCategory::SymbolOther => GeneralCategoryGroup::Symbol,
            GeneralCategory::SeparatorSpace |
            GeneralCategory::SeparatorLine |
            GeneralCategory::SeparatorParagraph => GeneralCategoryGroup::Separator,
            GeneralCategory::OtherControl |
            GeneralCategory::OtherFormat |
            GeneralCategory::OtherSurrogate |
            GeneralCategory::OtherPrivateUse |
            GeneralCategory::OtherUnassigned => GeneralCategoryGroup::Other,
        }
    }
""")

    # Two-letter category abbreviation as found in the data file -> Rust
    # expression naming the corresponding enum variant.
    variant_by_abbreviation = dict(
        (abbreviation, "GeneralCategory::" + variant)
        for abbreviation, variant in (
            ("Lu", "LetterUppercase"),
            ("Ll", "LetterLowercase"),
            ("Lt", "LetterTitlecase"),
            ("Lm", "LetterModifier"),
            ("Lo", "LetterOther"),
            ("Mn", "MarkNonspacing"),
            ("Mc", "MarkSpacing"),
            ("Me", "MarkEnclosing"),
            ("Nd", "NumberDecimal"),
            ("Nl", "NumberLetter"),
            ("No", "NumberOther"),
            ("Pc", "PunctuationConnector"),
            ("Pd", "PunctuationDash"),
            ("Ps", "PunctuationOpen"),
            ("Pe", "PunctuationClose"),
            ("Pi", "PunctuationInitial"),
            ("Pf", "PunctuationFinal"),
            ("Po", "PunctuationOther"),
            ("Sm", "SymbolMath"),
            ("Sc", "SymbolCurrency"),
            ("Sk", "SymbolModifier"),
            ("So", "SymbolOther"),
            ("Zs", "SeparatorSpace"),
            ("Zl", "SeparatorLine"),
            ("Zp", "SeparatorParagraph"),
            ("Cc", "OtherControl"),
            ("Cf", "OtherFormat"),
            ("Cs", "OtherSurrogate"),
            ("Co", "OtherPrivateUse"),
            ("Cn", "OtherUnassigned"),
        )
    )

    f.write("    // General category table:\n")

    # Collapse the per-code-point entries into maximal runs: an entry is folded
    # into the previous run when it starts right after that run ends and has
    # the same category.
    merged_ranges = []
    for lo, hi, category in load_general_category_properties("UnicodeData.txt"):
        # Surrogates are left out of the table: its bounds are emitted as Rust
        # `char` values, and a `char` cannot hold a surrogate code point.
        if category == "Cs":
            continue
        if merged_ranges:
            run_lo, run_hi, run_category = merged_ranges[-1]
            if run_hi + 1 == lo and run_category == category:
                merged_ranges[-1] = (run_lo, hi, run_category)
                continue
        merged_ranges.append((lo, hi, category))

    def format_row(row):
        # One table element: (first char, last char, enum variant).
        return "(%s,%s,%s)" % (escape_char(row[0]), escape_char(row[1]),
                               variant_by_abbreviation[row[2]])

    emit_table(f, "GENERAL_CATEGORY", merged_ranges,
               "&'static [(char, char, GeneralCategory)]", is_pub=False,
               pfun=format_row)
    f.write("}\n\n")
373
+
374
+
133
375
def emit_emoji_module (f ):
134
376
f .write ("""#[cfg(feature = \" emoji\" )]
135
377
pub mod emoji {""" )
136
378
f .write ("""
137
379
138
380
#[derive(Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd, Debug)]
139
- #[allow(non_camel_case_types)]
140
381
#[non_exhaustive]
141
382
pub enum EmojiStatus {
142
383
NonEmoji,
@@ -305,5 +546,7 @@ def emit_util_mod(f):
305
546
""" % UNICODE_VERSION )
306
547
307
548
emit_util_mod (rf )
549
+ ### general category module
550
+ emit_general_category_module (rf )
308
551
### emoji module
309
552
emit_emoji_module (rf )
0 commit comments