Skip to content

Commit 685a8cc

Browse files
authored
Merge pull request #70 from sunfishcode/ext
Add new normalization algorithms using Standardized Variants
2 parents 2f400a9 + 0d31e1e commit 685a8cc

File tree

7 files changed: +2276 −11 lines changed

scripts/unicode.py

Lines changed: 53 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,10 @@
1414
# - DerivedNormalizationProps.txt
1515
# - NormalizationTest.txt
1616
# - UnicodeData.txt
17+
# - StandardizedVariants.txt
1718
#
1819
# Since this should not require frequent updates, we just store this
19-
# out-of-line and check the unicode.rs file into git.
20+
# out-of-line and check the tables.rs and normalization_tests.rs files into git.
2021
import collections
2122
import urllib.request
2223

@@ -57,6 +58,11 @@
5758
'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
5859
}
5960

61+
# Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
# http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
S_BASE = 0xAC00
L_COUNT = 19
V_COUNT = 21
T_COUNT = 28
# Total number of precomposed Hangul syllables.
S_COUNT = L_COUNT * V_COUNT * T_COUNT
65+
6066
class UnicodeData(object):
6167
def __init__(self):
6268
self._load_unicode_data()
@@ -66,6 +72,9 @@ def __init__(self):
6672
self.canon_comp = self._compute_canonical_comp()
6773
self.canon_fully_decomp, self.compat_fully_decomp = self._compute_fully_decomposed()
6874

75+
self.cjk_compat_variants_fully_decomp = {}
76+
self._load_cjk_compat_ideograph_variants()
77+
6978
def stats(name, table):
7079
count = sum(len(v) for v in table.values())
7180
print("%s: %d chars => %d decomposed chars" % (name, len(table), count))
@@ -75,6 +84,7 @@ def stats(name, table):
7584
stats("Compatible decomp", self.compat_decomp)
7685
stats("Canonical fully decomp", self.canon_fully_decomp)
7786
stats("Compatible fully decomp", self.compat_fully_decomp)
87+
stats("CJK Compat Variants fully decomp", self.cjk_compat_variants_fully_decomp)
7888

7989
self.ss_leading, self.ss_trailing = self._compute_stream_safe_tables()
8090

@@ -83,6 +93,7 @@ def _fetch(self, filename):
8393
return resp.read().decode('utf-8')
8494

8595
def _load_unicode_data(self):
96+
self.name_to_char_int = {}
8697
self.combining_classes = {}
8798
self.compat_decomp = {}
8899
self.canon_decomp = {}
@@ -95,6 +106,9 @@ def _load_unicode_data(self):
95106
char, category, cc, decomp = pieces[0], pieces[2], pieces[3], pieces[5]
96107
char_int = int(char, 16)
97108

109+
name = pieces[1].strip()
110+
self.name_to_char_int[name] = char_int
111+
98112
if cc != '0':
99113
self.combining_classes[char_int] = cc
100114

@@ -106,6 +120,41 @@ def _load_unicode_data(self):
106120
if category == 'M' or 'M' in expanded_categories.get(category, []):
107121
self.general_category_mark.append(char_int)
108122

123+
def _load_cjk_compat_ideograph_variants(self):
    """Populate `self.cjk_compat_variants_fully_decomp` from
    StandardizedVariants.txt.

    CJK Compatibility Ideographs have singleton canonical decompositions
    that lose the original codepoint's identity; their standardized
    variation sequences preserve it, so we record each sequence (as a
    list of codepoint ints) keyed by the original ideograph.
    """
    for line in self._fetch("StandardizedVariants.txt").splitlines():
        # Drop trailing '#' comments; skip blank/comment-only lines.
        strip_comments = line.split('#', 1)[0].strip()
        if not strip_comments:
            continue

        # Fields: variation sequence; description; shaping differences.
        variation_sequence, description, differences = strip_comments.split(';')
        description = description.strip()

        # Don't use variations that only apply in particular shaping environments.
        if differences:
            continue

        # Look for entries where the description field is a codepoint name.
        if description not in self.name_to_char_int:
            continue

        # Only consider the CJK Compatibility Ideographs.
        if not description.startswith('CJK COMPATIBILITY IDEOGRAPH-'):
            continue

        char_int = self.name_to_char_int[description]

        assert char_int not in self.combining_classes, "Unexpected: CJK compat variant with a combining class"
        assert char_int not in self.compat_decomp, "Unexpected: CJK compat variant and compatibility decomposition"
        assert len(self.canon_decomp[char_int]) == 1, "Unexpected: CJK compat variant and non-singleton canonical decomposition"
        # If we ever need to handle Hangul here, we'll need to handle it separately.
        assert not (S_BASE <= char_int < S_BASE + S_COUNT)

        # The sequence is space-separated hex codepoints; each must already
        # be in normalized form so the replacement needs no further work.
        cjk_compat_variant_parts = [int(c, 16) for c in variation_sequence.split()]
        for c in cjk_compat_variant_parts:
            assert c not in self.canon_decomp, "Unexpected: CJK compat variant is unnormalized (canon)"
            assert c not in self.compat_decomp, "Unexpected: CJK compat variant is unnormalized (compat)"
        self.cjk_compat_variants_fully_decomp[char_int] = cjk_compat_variant_parts
157+
109158
def _load_norm_props(self):
110159
props = collections.defaultdict(list)
111160

@@ -178,11 +227,6 @@ def _compute_fully_decomposed(self):
178227
The upshot is that decomposition code is very simple and easy to inline
179228
at mild code size cost.
180229
"""
181-
# Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
182-
# http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
183-
S_BASE, L_COUNT, V_COUNT, T_COUNT = 0xAC00, 19, 21, 28
184-
S_COUNT = L_COUNT * V_COUNT * T_COUNT
185-
186230
def _decompose(char_int, compatible):
187231
# 7-bit ASCII never decomposes
188232
if char_int <= 0x7f:
@@ -320,8 +364,8 @@ def gen_composition_table(canon_comp, out):
320364
out.write(" }\n")
321365
out.write("}\n")
322366

323-
def gen_decomposition_tables(canon_decomp, compat_decomp, out):
324-
tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility')]
367+
def gen_decomposition_tables(canon_decomp, compat_decomp, cjk_compat_variants_decomp, out):
368+
tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility'), (cjk_compat_variants_decomp, 'cjk_compat_variants')]
325369
for table, name in tables:
326370
gen_mph_data(name + '_decomposed', table, "(u32, &'static [char])",
327371
lambda k: "(0x{:x}, &[{}])".format(k,
@@ -491,7 +535,7 @@ def minimal_perfect_hash(d):
491535
gen_composition_table(data.canon_comp, out)
492536
out.write("\n")
493537

494-
gen_decomposition_tables(data.canon_fully_decomp, data.compat_fully_decomp, out)
538+
gen_decomposition_tables(data.canon_fully_decomp, data.compat_fully_decomp, data.cjk_compat_variants_fully_decomp, out)
495539

496540
gen_combining_mark(data.general_category_mark, out)
497541
out.write("\n")

src/lib.rs

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ pub use crate::quick_check::{
5959
IsNormalized,
6060
};
6161
pub use crate::recompose::Recompositions;
62+
pub use crate::replace::Replacements;
6263
pub use crate::stream_safe::StreamSafe;
6364
pub use crate::tables::UNICODE_VERSION;
6465
use core::str::Chars;
@@ -71,6 +72,7 @@ mod normalize;
7172
mod perfect_hash;
7273
mod quick_check;
7374
mod recompose;
75+
mod replace;
7476
mod stream_safe;
7577

7678
#[rustfmt::skip]
@@ -83,7 +85,9 @@ mod test;
8385

8486
/// Methods for composing and decomposing characters.
8587
pub mod char {
86-
pub use crate::normalize::{compose, decompose_canonical, decompose_compatible};
88+
pub use crate::normalize::{
89+
compose, decompose_canonical, decompose_cjk_compat_variants, decompose_compatible,
90+
};
8791

8892
pub use crate::lookups::{canonical_combining_class, is_combining_mark};
8993
}
@@ -108,6 +112,18 @@ pub trait UnicodeNormalization<I: Iterator<Item = char>> {
108112
/// (compatibility decomposition followed by canonical composition).
109113
fn nfkc(self) -> Recompositions<I>;
110114

115+
/// A transformation which replaces CJK Compatibility Ideograph codepoints
116+
/// with normal forms using Standardized Variation Sequences. This is not
117+
/// part of the canonical or compatibility decomposition algorithms, but
118+
/// performing it before those algorithms produces normalized output which
119+
/// better preserves the intent of the original text.
120+
///
121+
/// Note that many systems today ignore variation selectors, so these
122+
/// may not immediately help text display as intended, but they at
123+
/// least preserve the information in a standardized form, giving
124+
/// implementations the option to recognize them.
125+
fn cjk_compat_variants(self) -> Replacements<I>;
126+
111127
/// An Iterator over the string with Conjoining Grapheme Joiner characters
112128
/// inserted according to the Stream-Safe Text Process (UAX15-D4)
113129
fn stream_safe(self) -> StreamSafe<I>;
@@ -134,6 +150,11 @@ impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
134150
recompose::new_compatible(self.chars())
135151
}
136152

153+
#[inline]
154+
fn cjk_compat_variants(self) -> Replacements<Chars<'a>> {
155+
replace::new_cjk_compat_variants(self.chars())
156+
}
157+
137158
#[inline]
138159
fn stream_safe(self) -> StreamSafe<Chars<'a>> {
139160
StreamSafe::new(self.chars())
@@ -161,6 +182,11 @@ impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
161182
recompose::new_compatible(self)
162183
}
163184

185+
#[inline]
186+
fn cjk_compat_variants(self) -> Replacements<I> {
187+
replace::new_cjk_compat_variants(self)
188+
}
189+
164190
#[inline]
165191
fn stream_safe(self) -> StreamSafe<I> {
166192
StreamSafe::new(self)

src/lookups.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,17 @@ pub(crate) fn compatibility_fully_decomposed(c: char) -> Option<&'static [char]>
6464
)
6565
}
6666

67+
pub(crate) fn cjk_compat_variants_fully_decomposed(c: char) -> Option<&'static [char]> {
68+
mph_lookup(
69+
c.into(),
70+
CJK_COMPAT_VARIANTS_DECOMPOSED_SALT,
71+
CJK_COMPAT_VARIANTS_DECOMPOSED_KV,
72+
pair_lookup_fk,
73+
pair_lookup_fv_opt,
74+
None,
75+
)
76+
}
77+
6778
/// Return whether the given character is a combining mark (`General_Category=Mark`)
6879
pub fn is_combining_mark(c: char) -> bool {
6980
mph_lookup(

src/normalize.rs

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@
1010

1111
//! Functions for computing canonical and compatible decompositions for Unicode characters.
1212
use crate::lookups::{
13-
canonical_fully_decomposed, compatibility_fully_decomposed, composition_table,
13+
canonical_fully_decomposed, cjk_compat_variants_fully_decomposed,
14+
compatibility_fully_decomposed, composition_table,
1415
};
1516

1617
use core::{char, ops::FnMut};
@@ -36,6 +37,39 @@ pub fn decompose_compatible<F: FnMut(char)>(c: char, emit_char: F) {
3637
decompose(c, decompose_char, emit_char)
3738
}
3839

40+
/// Compute standard-variation decomposition for character.
41+
///
42+
/// [Standardized Variation Sequences] are used instead of the standard canonical
43+
/// decompositions, notably for CJK codepoints with singleton canonical decompositions,
44+
/// to avoid losing information. See the
45+
/// [Unicode Variation Sequence FAQ](http://unicode.org/faq/vs.html) and the
46+
/// "Other Enhancements" section of the
47+
/// [Unicode 6.3 Release Summary](https://www.unicode.org/versions/Unicode6.3.0/#Summary)
48+
/// for more information.
49+
#[inline]
50+
pub fn decompose_cjk_compat_variants<F>(c: char, mut emit_char: F)
51+
where
52+
F: FnMut(char),
53+
{
54+
// 7-bit ASCII never decomposes
55+
if c <= '\x7f' {
56+
emit_char(c);
57+
return;
58+
}
59+
60+
// Don't perform decomposition for Hangul
61+
62+
if let Some(decomposed) = cjk_compat_variants_fully_decomposed(c) {
63+
for &d in decomposed {
64+
emit_char(d);
65+
}
66+
return;
67+
}
68+
69+
// Finally bottom out.
70+
emit_char(c);
71+
}
72+
3973
#[inline]
4074
fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F)
4175
where

src/replace.rs

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
2+
// file at the top-level directory of this distribution and at
3+
// http://rust-lang.org/COPYRIGHT.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
use core::fmt::{self, Write};
11+
use tinyvec::ArrayVec;
12+
13+
/// External iterator for replacements for a string's characters.
#[derive(Clone)]
pub struct Replacements<I> {
    iter: I,
    // At this time, the longest replacement sequence has length 2, so we just
    // need buffer space for 1 codepoint.
    buffer: Option<char>,
}

/// Wrap `iter` in a [`Replacements`] that substitutes CJK Compatibility
/// Ideographs with their Standardized Variation Sequences.
#[inline]
pub fn new_cjk_compat_variants<I: Iterator<Item = char>>(iter: I) -> Replacements<I> {
    Replacements { buffer: None, iter }
}
26+
27+
impl<I: Iterator<Item = char>> Iterator for Replacements<I> {
28+
type Item = char;
29+
30+
#[inline]
31+
fn next(&mut self) -> Option<char> {
32+
if let Some(c) = self.buffer.take() {
33+
return Some(c);
34+
}
35+
36+
match self.iter.next() {
37+
Some(ch) => {
38+
// At this time, the longest replacement sequence has length 2.
39+
let mut buffer = ArrayVec::<[char; 2]>::new();
40+
super::char::decompose_cjk_compat_variants(ch, |d| buffer.push(d));
41+
self.buffer = buffer.get(1).copied();
42+
Some(buffer[0])
43+
}
44+
None => None,
45+
}
46+
}
47+
48+
fn size_hint(&self) -> (usize, Option<usize>) {
49+
let (lower, _) = self.iter.size_hint();
50+
(lower, None)
51+
}
52+
}
53+
54+
impl<I: Iterator<Item = char> + Clone> fmt::Display for Replacements<I> {
55+
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
56+
for c in self.clone() {
57+
f.write_char(c)?;
58+
}
59+
Ok(())
60+
}
61+
}

0 commit comments

Comments
 (0)