Skip to content

Commit 685a8cc

Browse files
authored
Merge pull request #70 from sunfishcode/ext
Add new normalization algorithms using Standardized Variants
2 parents 2f400a9 + 0d31e1e commit 685a8cc

File tree

7 files changed: +2276 −11 lines changed

scripts/unicode.py

Lines changed: 53 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,10 @@
1414
# - DerivedNormalizationProps.txt
1515
# - NormalizationTest.txt
1616
# - UnicodeData.txt
17+
# - StandardizedVariants.txt
1718
#
1819
# Since this should not require frequent updates, we just store this
19-
# out-of-line and check the unicode.rs file into git.
20+
# out-of-line and check the tables.rs and normalization_tests.rs files into git.
2021
import collections
2122
import urllib.request
2223

@@ -57,6 +58,11 @@
5758
'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
5859
}
5960

61+
# Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
# http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
S_BASE = 0xAC00
L_COUNT = 19
V_COUNT = 21
T_COUNT = 28
# Total number of precomposed Hangul syllables.
S_COUNT = L_COUNT * V_COUNT * T_COUNT
65+
6066
class UnicodeData(object):
6167
def __init__(self):
6268
self._load_unicode_data()
@@ -66,6 +72,9 @@ def __init__(self):
6672
self.canon_comp = self._compute_canonical_comp()
6773
self.canon_fully_decomp, self.compat_fully_decomp = self._compute_fully_decomposed()
6874

75+
self.cjk_compat_variants_fully_decomp = {}
76+
self._load_cjk_compat_ideograph_variants()
77+
6978
def stats(name, table):
7079
count = sum(len(v) for v in table.values())
7180
print("%s: %d chars => %d decomposed chars" % (name, len(table), count))
@@ -75,6 +84,7 @@ def stats(name, table):
7584
stats("Compatible decomp", self.compat_decomp)
7685
stats("Canonical fully decomp", self.canon_fully_decomp)
7786
stats("Compatible fully decomp", self.compat_fully_decomp)
87+
stats("CJK Compat Variants fully decomp", self.cjk_compat_variants_fully_decomp)
7888

7989
self.ss_leading, self.ss_trailing = self._compute_stream_safe_tables()
8090

@@ -83,6 +93,7 @@ def _fetch(self, filename):
8393
return resp.read().decode('utf-8')
8494

8595
def _load_unicode_data(self):
96+
self.name_to_char_int = {}
8697
self.combining_classes = {}
8798
self.compat_decomp = {}
8899
self.canon_decomp = {}
@@ -95,6 +106,9 @@ def _load_unicode_data(self):
95106
char, category, cc, decomp = pieces[0], pieces[2], pieces[3], pieces[5]
96107
char_int = int(char, 16)
97108

109+
name = pieces[1].strip()
110+
self.name_to_char_int[name] = char_int
111+
98112
if cc != '0':
99113
self.combining_classes[char_int] = cc
100114

@@ -106,6 +120,41 @@ def _load_unicode_data(self):
106120
if category == 'M' or 'M' in expanded_categories.get(category, []):
107121
self.general_category_mark.append(char_int)
108122

123+
def _load_cjk_compat_ideograph_variants(self):
    """Populate `self.cjk_compat_variants_fully_decomp` from
    StandardizedVariants.txt.

    CJK Compatibility Ideographs have singleton canonical decompositions
    that lose the original codepoint's identity; their standardized
    variation sequences preserve it, so we record each sequence (as a
    list of codepoint ints) keyed by the original ideograph.
    """
    for line in self._fetch("StandardizedVariants.txt").splitlines():
        # Drop trailing '#' comments; skip blank/comment-only lines.
        strip_comments = line.split('#', 1)[0].strip()
        if not strip_comments:
            continue

        # Fields: variation sequence; description; shaping differences.
        variation_sequence, description, differences = strip_comments.split(';')
        description = description.strip()

        # Don't use variations that only apply in particular shaping environments.
        if differences:
            continue

        # Look for entries where the description field is a codepoint name.
        if description not in self.name_to_char_int:
            continue

        # Only consider the CJK Compatibility Ideographs.
        if not description.startswith('CJK COMPATIBILITY IDEOGRAPH-'):
            continue

        char_int = self.name_to_char_int[description]

        assert char_int not in self.combining_classes, "Unexpected: CJK compat variant with a combining class"
        assert char_int not in self.compat_decomp, "Unexpected: CJK compat variant and compatibility decomposition"
        assert len(self.canon_decomp[char_int]) == 1, "Unexpected: CJK compat variant and non-singleton canonical decomposition"
        # If we ever need to handle Hangul here, we'll need to handle it separately.
        assert not (S_BASE <= char_int < S_BASE + S_COUNT)

        # The sequence is space-separated hex codepoints; each must already
        # be in normalized form so the replacement needs no further work.
        cjk_compat_variant_parts = [int(c, 16) for c in variation_sequence.split()]
        for c in cjk_compat_variant_parts:
            assert c not in self.canon_decomp, "Unexpected: CJK compat variant is unnormalized (canon)"
            assert c not in self.compat_decomp, "Unexpected: CJK compat variant is unnormalized (compat)"
        self.cjk_compat_variants_fully_decomp[char_int] = cjk_compat_variant_parts
157+
109158
def _load_norm_props(self):
110159
props = collections.defaultdict(list)
111160

@@ -178,11 +227,6 @@ def _compute_fully_decomposed(self):
178227
The upshot is that decomposition code is very simple and easy to inline
179228
at mild code size cost.
180229
"""
181-
# Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
182-
# http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
183-
S_BASE, L_COUNT, V_COUNT, T_COUNT = 0xAC00, 19, 21, 28
184-
S_COUNT = L_COUNT * V_COUNT * T_COUNT
185-
186230
def _decompose(char_int, compatible):
187231
# 7-bit ASCII never decomposes
188232
if char_int <= 0x7f:
@@ -320,8 +364,8 @@ def gen_composition_table(canon_comp, out):
320364
out.write(" }\n")
321365
out.write("}\n")
322366

323-
def gen_decomposition_tables(canon_decomp, compat_decomp, out):
324-
tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility')]
367+
def gen_decomposition_tables(canon_decomp, compat_decomp, cjk_compat_variants_decomp, out):
368+
tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility'), (cjk_compat_variants_decomp, 'cjk_compat_variants')]
325369
for table, name in tables:
326370
gen_mph_data(name + '_decomposed', table, "(u32, &'static [char])",
327371
lambda k: "(0x{:x}, &[{}])".format(k,
@@ -491,7 +535,7 @@ def minimal_perfect_hash(d):
491535
gen_composition_table(data.canon_comp, out)
492536
out.write("\n")
493537

494-
gen_decomposition_tables(data.canon_fully_decomp, data.compat_fully_decomp, out)
538+
gen_decomposition_tables(data.canon_fully_decomp, data.compat_fully_decomp, data.cjk_compat_variants_fully_decomp, out)
495539

496540
gen_combining_mark(data.general_category_mark, out)
497541
out.write("\n")

src/lib.rs

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ pub use crate::quick_check::{
5959
IsNormalized,
6060
};
6161
pub use crate::recompose::Recompositions;
62+
pub use crate::replace::Replacements;
6263
pub use crate::stream_safe::StreamSafe;
6364
pub use crate::tables::UNICODE_VERSION;
6465
use core::str::Chars;
@@ -71,6 +72,7 @@ mod normalize;
7172
mod perfect_hash;
7273
mod quick_check;
7374
mod recompose;
75+
mod replace;
7476
mod stream_safe;
7577

7678
#[rustfmt::skip]
@@ -83,7 +85,9 @@ mod test;
8385

8486
/// Methods for composing and decomposing characters.
8587
pub mod char {
86-
pub use crate::normalize::{compose, decompose_canonical, decompose_compatible};
88+
pub use crate::normalize::{
89+
compose, decompose_canonical, decompose_cjk_compat_variants, decompose_compatible,
90+
};
8791

8892
pub use crate::lookups::{canonical_combining_class, is_combining_mark};
8993
}
@@ -108,6 +112,18 @@ pub trait UnicodeNormalization<I: Iterator<Item = char>> {
108112
/// (compatibility decomposition followed by canonical composition).
109113
fn nfkc(self) -> Recompositions<I>;
110114

115+
/// A transformation which replaces CJK Compatibility Ideograph codepoints
116+
/// with normal forms using Standardized Variation Sequences. This is not
117+
/// part of the canonical or compatibility decomposition algorithms, but
118+
/// performing it before those algorithms produces normalized output which
119+
/// better preserves the intent of the original text.
120+
///
121+
/// Note that many systems today ignore variation selectors, so these
122+
/// may not immediately help text display as intended, but they at
123+
/// least preserve the information in a standardized form, giving
124+
/// implementations the option to recognize them.
125+
fn cjk_compat_variants(self) -> Replacements<I>;
126+
111127
/// An Iterator over the string with Conjoining Grapheme Joiner characters
112128
/// inserted according to the Stream-Safe Text Process (UAX15-D4)
113129
fn stream_safe(self) -> StreamSafe<I>;
@@ -134,6 +150,11 @@ impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
134150
recompose::new_compatible(self.chars())
135151
}
136152

153+
#[inline]
154+
fn cjk_compat_variants(self) -> Replacements<Chars<'a>> {
155+
replace::new_cjk_compat_variants(self.chars())
156+
}
157+
137158
#[inline]
138159
fn stream_safe(self) -> StreamSafe<Chars<'a>> {
139160
StreamSafe::new(self.chars())
@@ -161,6 +182,11 @@ impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
161182
recompose::new_compatible(self)
162183
}
163184

185+
#[inline]
186+
fn cjk_compat_variants(self) -> Replacements<I> {
187+
replace::new_cjk_compat_variants(self)
188+
}
189+
164190
#[inline]
165191
fn stream_safe(self) -> StreamSafe<I> {
166192
StreamSafe::new(self)

src/lookups.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,17 @@ pub(crate) fn compatibility_fully_decomposed(c: char) -> Option<&'static [char]>
6464
)
6565
}
6666

67+
pub(crate) fn cjk_compat_variants_fully_decomposed(c: char) -> Option<&'static [char]> {
68+
mph_lookup(
69+
c.into(),
70+
CJK_COMPAT_VARIANTS_DECOMPOSED_SALT,
71+
CJK_COMPAT_VARIANTS_DECOMPOSED_KV,
72+
pair_lookup_fk,
73+
pair_lookup_fv_opt,
74+
None,
75+
)
76+
}
77+
6778
/// Return whether the given character is a combining mark (`General_Category=Mark`)
6879
pub fn is_combining_mark(c: char) -> bool {
6980
mph_lookup(

src/normalize.rs

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@
1010

1111
//! Functions for computing canonical and compatible decompositions for Unicode characters.
1212
use crate::lookups::{
13-
canonical_fully_decomposed, compatibility_fully_decomposed, composition_table,
13+
canonical_fully_decomposed, cjk_compat_variants_fully_decomposed,
14+
compatibility_fully_decomposed, composition_table,
1415
};
1516

1617
use core::{char, ops::FnMut};
@@ -36,6 +37,39 @@ pub fn decompose_compatible<F: FnMut(char)>(c: char, emit_char: F) {
3637
decompose(c, decompose_char, emit_char)
3738
}
3839

40+
/// Compute standard-variation decomposition for character.
41+
///
42+
/// [Standardized Variation Sequences] are used instead of the standard canonical
43+
/// decompositions, notably for CJK codepoints with singleton canonical decompositions,
44+
/// to avoid losing information. See the
45+
/// [Unicode Variation Sequence FAQ](http://unicode.org/faq/vs.html) and the
46+
/// "Other Enhancements" section of the
47+
/// [Unicode 6.3 Release Summary](https://www.unicode.org/versions/Unicode6.3.0/#Summary)
48+
/// for more information.
49+
#[inline]
50+
pub fn decompose_cjk_compat_variants<F>(c: char, mut emit_char: F)
51+
where
52+
F: FnMut(char),
53+
{
54+
// 7-bit ASCII never decomposes
55+
if c <= '\x7f' {
56+
emit_char(c);
57+
return;
58+
}
59+
60+
// Don't perform decomposition for Hangul
61+
62+
if let Some(decomposed) = cjk_compat_variants_fully_decomposed(c) {
63+
for &d in decomposed {
64+
emit_char(d);
65+
}
66+
return;
67+
}
68+
69+
// Finally bottom out.
70+
emit_char(c);
71+
}
72+
3973
#[inline]
4074
fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F)
4175
where

src/replace.rs

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
2+
// file at the top-level directory of this distribution and at
3+
// http://rust-lang.org/COPYRIGHT.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
use core::fmt::{self, Write};
11+
use tinyvec::ArrayVec;
12+
13+
/// External iterator for replacements for a string's characters.
#[derive(Clone)]
pub struct Replacements<I> {
    iter: I,
    // At this time, the longest replacement sequence has length 2, so we just
    // need buffer space for 1 codepoint.
    buffer: Option<char>,
}

/// Wrap `iter` in a [`Replacements`] that substitutes CJK Compatibility
/// Ideographs with their Standardized Variation Sequences.
#[inline]
pub fn new_cjk_compat_variants<I: Iterator<Item = char>>(iter: I) -> Replacements<I> {
    Replacements { buffer: None, iter }
}
26+
27+
impl<I: Iterator<Item = char>> Iterator for Replacements<I> {
28+
type Item = char;
29+
30+
#[inline]
31+
fn next(&mut self) -> Option<char> {
32+
if let Some(c) = self.buffer.take() {
33+
return Some(c);
34+
}
35+
36+
match self.iter.next() {
37+
Some(ch) => {
38+
// At this time, the longest replacement sequence has length 2.
39+
let mut buffer = ArrayVec::<[char; 2]>::new();
40+
super::char::decompose_cjk_compat_variants(ch, |d| buffer.push(d));
41+
self.buffer = buffer.get(1).copied();
42+
Some(buffer[0])
43+
}
44+
None => None,
45+
}
46+
}
47+
48+
fn size_hint(&self) -> (usize, Option<usize>) {
49+
let (lower, _) = self.iter.size_hint();
50+
(lower, None)
51+
}
52+
}
53+
54+
impl<I: Iterator<Item = char> + Clone> fmt::Display for Replacements<I> {
55+
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
56+
for c in self.clone() {
57+
f.write_char(c)?;
58+
}
59+
Ok(())
60+
}
61+
}

0 commit comments

Comments
 (0)