unicode-rs · PSeitz · Jul 14, 2025
diff --git a/scripts/unicode.py b/scripts/unicode.py
@@ -74,42 +74,6 @@ def fetch(f):
         sys.stderr.write("cannot load %s" % f)
         exit(1)
 
-def load_gencats(f):
-    fetch(f)
-    gencats = {}
-
-    udict = {};
-    range_start = -1;
-    for line in fileinput.input(f):
-        data = line.split(';');
-        if len(data) != 15:
-            continue
-        cp = int(data[0], 16);
-        if is_surrogate(cp):
-            continue
-        if range_start >= 0:
-            for i in range(range_start, cp):
-                udict[i] = data;
-            range_start = -1;
-        if data[1].endswith(", First>"):
-            range_start = cp;
-            continue;
-        udict[cp] = data;
-
-    for code in udict:
-        [code_org, name, gencat, combine, bidi,
-         decomp, deci, digit, num, mirror,
-         old, iso, upcase, lowcase, titlecase ] = udict[code];
-
-        # place letter in categories as appropriate
-        for cat in [gencat, "Assigned"] + expanded_categories.get(gencat, []):
-            if cat not in gencats:
-                gencats[cat] = []
-            gencats[cat].append(code)
-
-    gencats = group_cats(gencats)
-    return gencats
-
 def group_cats(cats):
     cats_out = {}
     for cat in cats:
@@ -230,36 +194,6 @@ def emit_util_mod(f):
         }).is_ok()
     }
 
-    #[inline]
-    fn is_alphabetic(c: char) -> bool {
-        if super::UNICODE_VERSION_U8 == char::UNICODE_VERSION {
-            c.is_alphabetic()
-        } else {
-            match c {
-                'a' ..= 'z' | 'A' ..= 'Z' => true,
-                c if c > '\\x7f' => super::derived_property::Alphabetic(c),
-                _ => false,
-            }
-        }
-    }
-
-    #[inline]
-    fn is_numeric(c: char) -> bool {
-        if super::UNICODE_VERSION_U8 == char::UNICODE_VERSION {
-            c.is_numeric()
-        } else {
-            match c {
-                '0' ..= '9' => true,
-                c if c > '\\x7f' => super::general_category::N(c),
-                _ => false,
-            }
-        }
-    }
-
-    #[inline]
-    pub fn is_alphanumeric(c: char) -> bool {
-        is_alphabetic(c) || is_numeric(c)
-    }
 }
 
 """)
@@ -396,20 +330,13 @@ def emit_break_module(f, break_table, break_cats, name):
 /// The version of [Unicode](http://www.unicode.org/)
 /// that this version of unicode-segmentation is based on.
 pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
-""" % UNICODE_VERSION)
-
-        rf.write("""
-const UNICODE_VERSION_U8: (u8, u8, u8) = (%s, %s, %s);
 """ % UNICODE_VERSION)
 
         # download and parse all the data
-        gencats = load_gencats("UnicodeData.txt")
-        derived = load_properties("DerivedCoreProperties.txt", ["Alphabetic", ("InCB", "Consonant"), ("InCB", "Extend"), ("InCB", "Linker")])
+        derived = load_properties("DerivedCoreProperties.txt", [("InCB", "Consonant"), ("InCB", "Extend"), ("InCB", "Linker")])
 
         emit_util_mod(rf)
-        for (name, cat, pfuns) in ("general_category", gencats, ["N"]), \
-                                  ("derived_property", derived, ["Alphabetic", ("InCB", "Extend")]):
-            emit_property_module(rf, name, cat, pfuns)
+        emit_property_module(rf, "derived_property", derived, [("InCB", "Extend")])
 
         rf.write("""pub fn is_incb_linker(c: char) -> bool {
     matches!(c,""")

diff --git a/src/sentence.rs b/src/sentence.rs
@@ -345,10 +345,9 @@ pub fn new_sentence_bound_indices(source: &str) -> USentenceBoundIndices<'_> {
 #[inline]
 pub fn new_unicode_sentences(s: &str) -> UnicodeSentences<'_> {
     use super::UnicodeSegmentation;
-    use crate::tables::util::is_alphanumeric;
 
     fn has_alphanumeric(s: &&str) -> bool {
-        s.chars().any(is_alphanumeric)
+        s.chars().any(|c| c.is_alphanumeric())
     }
     let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer