Skip to content

Commit 949e39c

Browse files
committed
use is_alphanumeric from std
Use is_alphanumeric from std and update unicode.py to remove obsolete parts. Closes #148.
1 parent 9b1b7f9 commit 949e39c

File tree

4 files changed

+3159
-405
lines changed

4 files changed

+3159
-405
lines changed

scripts/unicode.py

Lines changed: 2 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -74,42 +74,6 @@ def fetch(f):
7474
sys.stderr.write("cannot load %s" % f)
7575
exit(1)
7676

77-
def load_gencats(f):
78-
fetch(f)
79-
gencats = {}
80-
81-
udict = {};
82-
range_start = -1;
83-
for line in fileinput.input(f):
84-
data = line.split(';');
85-
if len(data) != 15:
86-
continue
87-
cp = int(data[0], 16);
88-
if is_surrogate(cp):
89-
continue
90-
if range_start >= 0:
91-
for i in range(range_start, cp):
92-
udict[i] = data;
93-
range_start = -1;
94-
if data[1].endswith(", First>"):
95-
range_start = cp;
96-
continue;
97-
udict[cp] = data;
98-
99-
for code in udict:
100-
[code_org, name, gencat, combine, bidi,
101-
decomp, deci, digit, num, mirror,
102-
old, iso, upcase, lowcase, titlecase ] = udict[code];
103-
104-
# place letter in categories as appropriate
105-
for cat in [gencat, "Assigned"] + expanded_categories.get(gencat, []):
106-
if cat not in gencats:
107-
gencats[cat] = []
108-
gencats[cat].append(code)
109-
110-
gencats = group_cats(gencats)
111-
return gencats
112-
11377
def group_cats(cats):
11478
cats_out = {}
11579
for cat in cats:
@@ -230,36 +194,6 @@ def emit_util_mod(f):
230194
}).is_ok()
231195
}
232196
233-
#[inline]
234-
fn is_alphabetic(c: char) -> bool {
235-
if super::UNICODE_VERSION_U8 == char::UNICODE_VERSION {
236-
c.is_alphabetic()
237-
} else {
238-
match c {
239-
'a' ..= 'z' | 'A' ..= 'Z' => true,
240-
c if c > '\\x7f' => super::derived_property::Alphabetic(c),
241-
_ => false,
242-
}
243-
}
244-
}
245-
246-
#[inline]
247-
fn is_numeric(c: char) -> bool {
248-
if super::UNICODE_VERSION_U8 == char::UNICODE_VERSION {
249-
c.is_numeric()
250-
} else {
251-
match c {
252-
'0' ..= '9' => true,
253-
c if c > '\\x7f' => super::general_category::N(c),
254-
_ => false,
255-
}
256-
}
257-
}
258-
259-
#[inline]
260-
pub fn is_alphanumeric(c: char) -> bool {
261-
is_alphabetic(c) || is_numeric(c)
262-
}
263197
}
264198
265199
""")
@@ -396,20 +330,13 @@ def emit_break_module(f, break_table, break_cats, name):
396330
/// The version of [Unicode](http://www.unicode.org/)
397331
/// that this version of unicode-segmentation is based on.
398332
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
399-
""" % UNICODE_VERSION)
400-
401-
rf.write("""
402-
const UNICODE_VERSION_U8: (u8, u8, u8) = (%s, %s, %s);
403333
""" % UNICODE_VERSION)
404334

405335
# download and parse all the data
406-
gencats = load_gencats("UnicodeData.txt")
407-
derived = load_properties("DerivedCoreProperties.txt", ["Alphabetic", ("InCB", "Consonant"), ("InCB", "Extend"), ("InCB", "Linker")])
336+
derived = load_properties("DerivedCoreProperties.txt", [("InCB", "Consonant"), ("InCB", "Extend"), ("InCB", "Linker")])
408337

409338
emit_util_mod(rf)
410-
for (name, cat, pfuns) in ("general_category", gencats, ["N"]), \
411-
("derived_property", derived, ["Alphabetic", ("InCB", "Extend")]):
412-
emit_property_module(rf, name, cat, pfuns)
339+
emit_property_module(rf, "derived_property", derived, [("InCB", "Extend")])
413340

414341
rf.write("""pub fn is_incb_linker(c: char) -> bool {
415342
matches!(c,""")

src/sentence.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -345,10 +345,9 @@ pub fn new_sentence_bound_indices(source: &str) -> USentenceBoundIndices<'_> {
345345
#[inline]
346346
pub fn new_unicode_sentences(s: &str) -> UnicodeSentences<'_> {
347347
use super::UnicodeSegmentation;
348-
use crate::tables::util::is_alphanumeric;
349348

350349
fn has_alphanumeric(s: &&str) -> bool {
351-
s.chars().any(is_alphanumeric)
350+
s.chars().any(|c| c.is_alphanumeric())
352351
}
353352
let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
354353

0 commit comments

Comments
 (0)