Skip to content

Commit d7257af

Browse files
committed
Implement general category retrieval.
1 parent dfb02a6 commit d7257af

File tree

4 files changed

+2955
-3
lines changed

4 files changed

+2955
-3
lines changed

Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@ version = "0.1.0"
44
edition = "2021"
55

66
[features]
7+
general-category = []
78
emoji = []
8-
default = ["emoji"]
9+
default = ["general-category", "emoji"]
910

1011
[dependencies]

scripts/unicode.py

Lines changed: 244 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,60 @@ def load_emoji_properties(f):
8080

8181
return kinds
8282

83+
84+
def load_general_category_properties(f):
    """Load General_Category ranges from a UnicodeData-style file.

    ``f`` is first handed to ``fetch_unidata`` (presumably to make the file
    available locally -- confirm against that helper); the data is then read
    as UTF-8 from ``os.path.basename(f)`` in the current directory.

    Returns a list of ``(lo, hi, gc)`` tuples in file order.  ``lo`` and
    ``hi`` are inclusive integer code points and ``gc`` is the category
    abbreviation found in the third ``;``-separated field (e.g. ``"Lu"``).

    * An entry with an ordinary name, or with a single ``<...>`` placeholder
      name, produces a one-code-point range (``lo == hi``).
    * A ``<X, First>`` entry followed by the matching ``<X, Last>`` entry
      produces one range covering both end points.

    Lines that do not look like a data record are ignored.

    Raises ``ValueError`` when the First/Last pairing is inconsistent (a
    ``Last`` without a ``First``, differing labels or categories, a ``First``
    that is never closed) or when a ``<``-prefixed name has an unknown shape.
    """
    fetch_unidata(f)

    entry_re = re.compile(r"^([0-9A-F]+);([^;]+);([A-Za-z]+);.*$")
    range_first_re = re.compile(r"^<(.*), First>$")
    range_last_re = re.compile(r"^<(.*), Last>$")
    placeholder_re = re.compile(r"^<(.*)>$")

    general_category_list = []
    # (lo, label, gc) of a "<label, First>" entry still waiting for its
    # "<label, Last>" partner; None when no range is open.
    open_range = None

    # The context manager closes the fileinput state even if parsing raises,
    # so a later fileinput.input() call is not blocked by a stale handle.
    with fileinput.input(os.path.basename(f), openhook=fileinput.hook_encoded("utf-8")) as source:
        for line in source:
            entry = entry_re.match(line)
            if not entry:
                continue

            code_point = int(entry.group(1), 16)
            name = entry.group(2).strip()
            gc = entry.group(3).strip()

            if not name.startswith('<'):
                general_category_list.append((code_point, code_point, gc))
                continue

            first = range_first_re.match(name)
            if first:
                if open_range is not None:
                    raise ValueError(
                        "range <%s, First> opened at U+%04X was never closed"
                        % (open_range[1], open_range[0]))
                open_range = (code_point, first.group(1), gc)
                continue

            last = range_last_re.match(name)
            if last:
                if open_range is None:
                    raise ValueError(
                        "<%s, Last> at U+%04X has no matching First entry"
                        % (last.group(1), code_point))
                lo, label, first_gc = open_range
                if label != last.group(1) or first_gc != gc:
                    raise ValueError(
                        "range end <%s, Last> (%s) at U+%04X does not match "
                        "range start <%s, First> (%s) at U+%04X"
                        % (last.group(1), gc, code_point, label, first_gc, lo))
                general_category_list.append((lo, code_point, gc))
                open_range = None
                continue

            # Any other "<...>" name (checked last because the First/Last
            # forms also match this pattern) describes a single code point.
            if placeholder_re.match(name):
                general_category_list.append((code_point, code_point, gc))
                continue

            raise ValueError("unrecognized name field %r at U+%04X" % (name, code_point))

    if open_range is not None:
        raise ValueError(
            "range <%s, First> opened at U+%04X was never closed"
            % (open_range[1], open_range[0]))

    return general_category_list
136+
83137
def format_table_content(f, content, indent):
84138
line = " "*indent
85139
first = True
@@ -130,13 +184,200 @@ def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
130184
format_table_content(f, data, 8)
131185
f.write("\n ];\n\n")
132186

187+
def emit_general_category_module(f):
    """Write the Rust ``general_category`` module to the open file ``f``.

    The emitted module is gated on the ``general-category`` feature and
    contains, in order:

    * the ``GeneralCategory`` and ``GeneralCategoryGroup`` enums,
    * the crate-private helpers ``general_category_of_char``,
      ``general_category_is_letter_cased`` and ``general_category_group``,
    * the ``GENERAL_CATEGORY`` range table built from ``UnicodeData.txt``.

    Entries with category ``Cs`` are left out of the table, and neighbouring
    entries that are contiguous and share a category are merged into a single
    range before the table is emitted.
    """
    f.write("""#[cfg(feature = \"general-category\")]
pub mod general_category {""")
    f.write("""

    #[derive(Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd, Debug)]
    pub enum GeneralCategory {
        /// an uppercase letter
        LetterUppercase,
        /// a lowercase letter
        LetterLowercase,
        /// a digraphic character, with first part uppercase
        LetterTitlecase,
        /// a modifier letter
        LetterModifier,
        /// other letters, including syllables and ideographs
        LetterOther,
        /// a nonspacing combining mark (zero advance width)
        MarkNonspacing,
        /// a spacing combining mark (positive advance width)
        MarkSpacing,
        /// an enclosing combining mark
        MarkEnclosing,
        /// a decimal digit
        NumberDecimal,
        /// a letterlike numeric character
        NumberLetter,
        /// a numeric character of other type
        NumberOther,
        /// a connecting punctuation mark, like a tie
        PunctuationConnector,
        /// a dash or hyphen punctuation mark
        PunctuationDash,
        /// an opening punctuation mark (of a pair)
        PunctuationOpen,
        /// a closing punctuation mark (of a pair)
        PunctuationClose,
        /// an initial quotation mark
        PunctuationInitial,
        /// a final quotation mark
        PunctuationFinal,
        /// a punctuation mark of other type
        PunctuationOther,
        /// a symbol of mathematical use
        SymbolMath,
        /// a currency sign
        SymbolCurrency,
        /// a non-letterlike modifier symbol
        SymbolModifier,
        /// a symbol of other type
        SymbolOther,
        /// a space character (of various non-zero widths)
        SeparatorSpace,
        /// U+2028 LINE SEPARATOR only
        SeparatorLine,
        /// U+2029 PARAGRAPH SEPARATOR only
        SeparatorParagraph,
        /// a C0 or C1 control code
        OtherControl,
        /// a format control character
        OtherFormat,
        /// a surrogate code point
        OtherSurrogate,
        /// a private-use character
        OtherPrivateUse,
        /// a reserved unassigned code point or a noncharacter
        OtherUnassigned,
    }

    #[derive(Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd, Debug)]
    pub enum GeneralCategoryGroup {
        /// Lu | Ll | Lt | Lm | Lo
        Letter,
        /// Mn | Mc | Me
        Mark,
        /// Nd | Nl | No
        Number,
        /// Pc | Pd | Ps | Pe | Pi | Pf | Po
        Punctuation,
        /// Sm | Sc | Sk | So
        Symbol,
        /// Zs | Zl | Zp
        Separator,
        /// Cc | Cf | Cs | Co | Cn
        Other,
    }

    #[inline]
    pub(crate) fn general_category_of_char(c: char) -> GeneralCategory {
        super::util::bsearch_range_value_table(c, GENERAL_CATEGORY)
            .unwrap_or(GeneralCategory::OtherUnassigned)
    }

    #[inline]
    pub(crate) fn general_category_is_letter_cased(gc: GeneralCategory) -> bool {
        matches!(gc, GeneralCategory::LetterUppercase | GeneralCategory::LetterLowercase | GeneralCategory::LetterTitlecase)
    }

    #[inline]
    pub(crate) fn general_category_group(gc: GeneralCategory) -> GeneralCategoryGroup {
        match gc {
            GeneralCategory::LetterUppercase |
            GeneralCategory::LetterLowercase |
            GeneralCategory::LetterTitlecase |
            GeneralCategory::LetterModifier |
            GeneralCategory::LetterOther => GeneralCategoryGroup::Letter,
            GeneralCategory::MarkNonspacing |
            GeneralCategory::MarkSpacing |
            GeneralCategory::MarkEnclosing => GeneralCategoryGroup::Mark,
            GeneralCategory::NumberDecimal |
            GeneralCategory::NumberLetter |
            GeneralCategory::NumberOther => GeneralCategoryGroup::Number,
            GeneralCategory::PunctuationConnector |
            GeneralCategory::PunctuationDash |
            GeneralCategory::PunctuationOpen |
            GeneralCategory::PunctuationClose |
            GeneralCategory::PunctuationInitial |
            GeneralCategory::PunctuationFinal |
            GeneralCategory::PunctuationOther => GeneralCategoryGroup::Punctuation,
            GeneralCategory::SymbolMath |
            GeneralCategory::SymbolCurrency |
            GeneralCategory::SymbolModifier |
            GeneralCategory::SymbolOther => GeneralCategoryGroup::Symbol,
            GeneralCategory::SeparatorSpace |
            GeneralCategory::SeparatorLine |
            GeneralCategory::SeparatorParagraph => GeneralCategoryGroup::Separator,
            GeneralCategory::OtherControl |
            GeneralCategory::OtherFormat |
            GeneralCategory::OtherSurrogate |
            GeneralCategory::OtherPrivateUse |
            GeneralCategory::OtherUnassigned => GeneralCategoryGroup::Other,
        }
    }
""")
    # Category abbreviation as it appears in the data file -> Rust variant
    # path written into the generated table.
    gc_variants = {
        "Lu": "GeneralCategory::LetterUppercase",
        "Ll": "GeneralCategory::LetterLowercase",
        "Lt": "GeneralCategory::LetterTitlecase",
        "Lm": "GeneralCategory::LetterModifier",
        "Lo": "GeneralCategory::LetterOther",
        "Mn": "GeneralCategory::MarkNonspacing",
        "Mc": "GeneralCategory::MarkSpacing",
        "Me": "GeneralCategory::MarkEnclosing",
        "Nd": "GeneralCategory::NumberDecimal",
        "Nl": "GeneralCategory::NumberLetter",
        "No": "GeneralCategory::NumberOther",
        "Pc": "GeneralCategory::PunctuationConnector",
        "Pd": "GeneralCategory::PunctuationDash",
        "Ps": "GeneralCategory::PunctuationOpen",
        "Pe": "GeneralCategory::PunctuationClose",
        "Pi": "GeneralCategory::PunctuationInitial",
        "Pf": "GeneralCategory::PunctuationFinal",
        "Po": "GeneralCategory::PunctuationOther",
        "Sm": "GeneralCategory::SymbolMath",
        "Sc": "GeneralCategory::SymbolCurrency",
        "Sk": "GeneralCategory::SymbolModifier",
        "So": "GeneralCategory::SymbolOther",
        "Zs": "GeneralCategory::SeparatorSpace",
        "Zl": "GeneralCategory::SeparatorLine",
        "Zp": "GeneralCategory::SeparatorParagraph",
        "Cc": "GeneralCategory::OtherControl",
        "Cf": "GeneralCategory::OtherFormat",
        "Cs": "GeneralCategory::OtherSurrogate",
        "Co": "GeneralCategory::OtherPrivateUse",
        "Cn": "GeneralCategory::OtherUnassigned",
    }

    f.write("    // General category table:\n")
    general_category_char_table = load_general_category_properties("UnicodeData.txt")

    # Collapse runs of adjacent code points that share a category so the
    # generated table holds one row per contiguous range.
    general_category_group_table = []
    for lo, hi, gc in general_category_char_table:
        # "Cs" rows are not emitted; the table is keyed by Rust `char`
        # (NOTE(review): presumably because a `char` can never be a
        # surrogate -- confirm this is the intent).
        if gc == "Cs":
            continue
        if general_category_group_table:
            prev_lo, prev_hi, prev_gc = general_category_group_table[-1]
            if prev_hi + 1 == lo and prev_gc == gc:
                general_category_group_table[-1] = (prev_lo, hi, prev_gc)
                continue
        general_category_group_table.append((lo, hi, gc))

    emit_table(f, "GENERAL_CATEGORY", general_category_group_table,
               "&'static [(char, char, GeneralCategory)]", is_pub=False,
               pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), gc_variants[x[2]]))
    f.write("}\n\n")
373+
374+
133375
def emit_emoji_module(f):
134376
f.write("""#[cfg(feature = \"emoji\")]
135377
pub mod emoji {""")
136378
f.write("""
137379
138380
#[derive(Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd, Debug)]
139-
#[allow(non_camel_case_types)]
140381
#[non_exhaustive]
141382
pub enum EmojiStatus {
142383
NonEmoji,
@@ -305,5 +546,7 @@ def emit_util_mod(f):
305546
""" % UNICODE_VERSION)
306547

307548
emit_util_mod(rf)
549+
### general category module
550+
emit_general_category_module(rf)
308551
### emoji module
309552
emit_emoji_module(rf)

src/lib.rs

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,44 @@ mod emoji {
2424
}
2525
}
2626

27+
#[cfg(feature = "general-category")]
mod general_category {
    use crate::tables::general_category as gc_tables;

    pub use crate::tables::general_category::{GeneralCategory, GeneralCategoryGroup};

    /// General-category queries for values that have a Unicode
    /// [`GeneralCategory`].
    ///
    /// Implementors only need to supply [`general_category`]; the remaining
    /// methods are derived from it.
    ///
    /// [`general_category`]: UnicodeGeneralCategory::general_category
    pub trait UnicodeGeneralCategory: Sized {
        /// Returns the general category of this value.
        fn general_category(self) -> GeneralCategory;

        /// Returns the [`GeneralCategoryGroup`] that this value's general
        /// category belongs to.
        fn general_category_group(self) -> GeneralCategoryGroup {
            let category = self.general_category();
            gc_tables::general_category_group(category)
        }

        /// Returns `true` when this value's general category is an
        /// uppercase, lowercase or titlecase letter.
        fn is_letter_cased(self) -> bool {
            let category = self.general_category();
            gc_tables::general_category_is_letter_cased(category)
        }
    }

    impl UnicodeGeneralCategory for char {
        /// Looks the character up in the generated general-category table.
        fn general_category(self) -> GeneralCategory {
            gc_tables::general_category_of_char(self)
        }
    }
}
51+
2752
pub use tables::UNICODE_VERSION;
2853

2954
#[cfg(feature = "emoji")]
3055
pub use emoji::UnicodeEmoji;
3156

3257
#[cfg(feature = "emoji")]
3358
pub use emoji::EmojiStatus;
59+
60+
#[cfg(feature = "general-category")]
61+
pub use general_category::GeneralCategory;
62+
63+
#[cfg(feature = "general-category")]
64+
pub use general_category::GeneralCategoryGroup;
65+
66+
#[cfg(feature = "general-category")]
67+
pub use general_category::UnicodeGeneralCategory;

0 commit comments

Comments
 (0)