Skip to content

Commit 7609bbe

Browse files
committed
Add alignment tracking
1 parent e5a8ac8 commit 7609bbe

File tree

5 files changed

+463
-6
lines changed

5 files changed

+463
-6
lines changed

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
name = "unicode-normalization"
44
version = "0.1.22"
5-
authors = ["kwantam <kwantam@gmail.com>", "Manish Goregaokar <manishsmail@gmail.com>"]
5+
authors = ["kwantam <kwantam@gmail.com>", "Manish Goregaokar <manishsmail@gmail.com>", "Anthony MOI <m.anthony.moi@gmail.com>"]
66

77
homepage = "https://github.com/unicode-rs/unicode-normalization"
88
repository = "https://github.com/unicode-rs/unicode-normalization"

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ Unicode character composition and decomposition utilities
77
as described in
88
[Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
99

10+
This crate also includes alignment-information tracking from <https://github.com/n1t0/unicode-normalization>, exposed through separate traits. It remains 100% compatible with the upstream crate.
11+
1012
This crate requires Rust 1.36+.
1113

1214
```rust

src/decompose_alignment.rs

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
2+
// file at the top-level directory of this distribution and at
3+
// http://rust-lang.org/COPYRIGHT.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
use core::fmt::{self, Write};
11+
use core::iter::Fuse;
12+
use core::ops::Range;
13+
use smallvec::SmallVec;
14+
15+
/// Selects which decomposition mapping a `DecompositionsAlignment` applies to
/// each source character.
///
/// This is a field-less tag, so it is cheap to copy and compare.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum DecompositionType {
    /// Canonical decomposition (selected by `new_canonical`).
    Canonical,
    /// Compatibility decomposition (selected by `new_compatible`).
    Compatible,
}
20+
21+
/// External iterator for a string decomposition's characters.
22+
#[derive(Clone)]
23+
pub struct DecompositionsAlignment<I> {
24+
kind: DecompositionType,
25+
iter: Fuse<I>,
26+
27+
// This buffer stores pairs of (canonical combining class, character),
28+
// pushed onto the end in text order.
29+
//
30+
// It's divided into up to three sections:
31+
// 1) A prefix that is free space;
32+
// 2) "Ready" characters which are sorted and ready to emit on demand;
33+
// 3) A "pending" block which stills needs more characters for us to be able
34+
// to sort in canonical order and is not safe to emit.
35+
buffer: SmallVec<[(u8, char, isize); 4]>,
36+
ready: Range<usize>,
37+
}
38+
39+
#[inline]
40+
pub fn new_canonical<I: Iterator<Item = char>>(iter: I) -> DecompositionsAlignment<I> {
41+
DecompositionsAlignment {
42+
kind: self::DecompositionType::Canonical,
43+
iter: iter.fuse(),
44+
buffer: Default::default(),
45+
ready: 0..0,
46+
}
47+
}
48+
49+
#[inline]
50+
pub fn new_compatible<I: Iterator<Item = char>>(iter: I) -> DecompositionsAlignment<I> {
51+
DecompositionsAlignment {
52+
kind: self::DecompositionType::Compatible,
53+
iter: iter.fuse(),
54+
buffer: Default::default(),
55+
ready: 0..0,
56+
}
57+
}
58+
59+
impl<I> DecompositionsAlignment<I> {
60+
#[inline]
61+
fn push_back(&mut self, ch: char, first: bool) {
62+
let class = super::char::canonical_combining_class(ch);
63+
64+
if class == 0 {
65+
self.sort_pending();
66+
self.buffer.push((class, ch, if first { 0 } else { 1 }));
67+
self.ready.end = self.buffer.len();
68+
} else {
69+
self.buffer.push((class, ch, if first { 0 } else { 1 }));
70+
}
71+
}
72+
73+
#[inline]
74+
fn sort_pending(&mut self) {
75+
// NB: `sort_by_key` is stable, so it will preserve the original text's
76+
// order within a combining class.
77+
self.buffer[self.ready.end..].sort_by_key(|k| k.0);
78+
}
79+
80+
#[inline]
81+
fn reset_buffer(&mut self) {
82+
// Equivalent to `self.buffer.drain(0..self.ready.end)`
83+
// but faster than drain() if the buffer is a SmallVec or TinyVec
84+
let pending = self.buffer.len() - self.ready.end;
85+
for i in 0..pending {
86+
self.buffer[i] = self.buffer[i + self.ready.end];
87+
}
88+
self.buffer.truncate(pending);
89+
self.ready = 0..0;
90+
}
91+
92+
#[inline]
93+
fn increment_next_ready(&mut self) {
94+
let next = self.ready.start + 1;
95+
if next == self.ready.end {
96+
self.reset_buffer();
97+
} else {
98+
self.ready.start = next;
99+
}
100+
}
101+
}
102+
103+
impl<I: Iterator<Item = char>> Iterator for DecompositionsAlignment<I> {
104+
type Item = (char, isize);
105+
106+
#[inline]
107+
fn next(&mut self) -> Option<Self::Item> {
108+
while self.ready.end == 0 {
109+
match (self.iter.next(), &self.kind) {
110+
(Some(ch), &DecompositionType::Canonical) => {
111+
let mut first = true;
112+
super::char::decompose_canonical(ch, |d| {
113+
self.push_back(d, first);
114+
first = false;
115+
});
116+
}
117+
(Some(ch), &DecompositionType::Compatible) => {
118+
let mut first = true;
119+
super::char::decompose_compatible(ch, |d| {
120+
self.push_back(d, first);
121+
first = false;
122+
});
123+
}
124+
(None, _) => {
125+
if self.buffer.is_empty() {
126+
return None;
127+
}
128+
self.sort_pending();
129+
self.ready.end = self.buffer.len();
130+
131+
// This implementation means that we can call `next`
132+
// on an exhausted iterator; the last outer `next` call
133+
// will result in an inner `next` call. To make this
134+
// safe, we use `fuse`.
135+
break;
136+
}
137+
}
138+
}
139+
140+
// We can assume here that, if `self.ready.end` is greater than zero,
141+
// it's also greater than `self.ready.start`. That's because we only
142+
// increment `self.ready.start` inside `increment_next_ready`, and
143+
// whenever it reaches equality with `self.ready.end`, we reset both
144+
// to zero, maintaining the invariant that:
145+
// self.ready.start < self.ready.end || self.ready.end == self.ready.start == 0
146+
//
147+
// This less-than-obviously-safe implementation is chosen for performance,
148+
// minimizing the number & complexity of branches in `next` in the common
149+
// case of buffering then unbuffering a single character with each call.
150+
let (_, ch, size) = self.buffer[self.ready.start];
151+
self.increment_next_ready();
152+
Some((ch, size))
153+
}
154+
155+
fn size_hint(&self) -> (usize, Option<usize>) {
156+
let (lower, _) = self.iter.size_hint();
157+
(lower, None)
158+
}
159+
}
160+
161+
impl<I: Iterator<Item = char> + Clone> fmt::Display for DecompositionsAlignment<I> {
162+
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
163+
for (c, _) in self.clone() {
164+
f.write_char(c)?;
165+
}
166+
Ok(())
167+
}
168+
}

src/lib.rs

Lines changed: 138 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -53,28 +53,29 @@ extern crate core;
5353
extern crate smallvec;
5454

5555
pub use crate::decompose::Decompositions;
56+
pub use crate::decompose_alignment::DecompositionsAlignment;
5657
pub use crate::quick_check::{
5758
is_nfc, is_nfc_quick, is_nfc_stream_safe, is_nfc_stream_safe_quick, is_nfd, is_nfd_quick,
5859
is_nfd_stream_safe, is_nfd_stream_safe_quick, is_nfkc, is_nfkc_quick, is_nfkd, is_nfkd_quick,
5960
IsNormalized,
6061
};
6162
pub use crate::recompose::Recompositions;
63+
pub use crate::recompose_alignment::RecompositionsAlignment;
6264
pub use crate::replace::Replacements;
6365
pub use crate::stream_safe::StreamSafe;
6466
pub use crate::tables::UNICODE_VERSION;
65-
use core::{
66-
str::Chars,
67-
option,
68-
};
67+
use core::{option, str::Chars};
6968

7069
mod no_std_prelude;
7170

7271
mod decompose;
72+
mod decompose_alignment;
7373
mod lookups;
7474
mod normalize;
7575
mod perfect_hash;
7676
mod quick_check;
7777
mod recompose;
78+
mod recompose_alignment;
7879
mod replace;
7980
mod stream_safe;
8081

@@ -169,7 +170,6 @@ impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
169170
}
170171
}
171172

172-
173173
impl UnicodeNormalization<option::IntoIter<char>> for char {
174174
#[inline]
175175
fn nfd(self) -> Decompositions<option::IntoIter<char>> {
@@ -233,3 +233,136 @@ impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
233233
StreamSafe::new(self)
234234
}
235235
}
236+
237+
/// Methods for iterating over strings while applying Unicode normalizations
238+
/// as described in
239+
/// [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
240+
pub trait UnicodeNormalizationAlignment<I: Iterator<Item = char>> {
241+
/// Returns an iterator over the string in Unicode Normalization Form D
242+
/// (canonical decomposition).
243+
fn nfd(self) -> DecompositionsAlignment<I>;
244+
245+
/// Returns an iterator over the string in Unicode Normalization Form KD
246+
/// (compatibility decomposition).
247+
fn nfkd(self) -> DecompositionsAlignment<I>;
248+
249+
/// An Iterator over the string in Unicode Normalization Form C
250+
/// (canonical decomposition followed by canonical composition).
251+
fn nfc(self) -> RecompositionsAlignment<I>;
252+
253+
/// An Iterator over the string in Unicode Normalization Form KC
254+
/// (compatibility decomposition followed by canonical composition).
255+
fn nfkc(self) -> RecompositionsAlignment<I>;
256+
257+
/// A transformation which replaces CJK Compatibility Ideograph codepoints
258+
/// with normal forms using Standardized Variation Sequences. This is not
259+
/// part of the canonical or compatibility decomposition algorithms, but
260+
/// performing it before those algorithms produces normalized output which
261+
/// better preserves the intent of the original text.
262+
///
263+
/// Note that many systems today ignore variation selectors, so these
264+
/// may not immediately help text display as intended, but they at
265+
/// least preserve the information in a standardized form, giving
266+
/// implementations the option to recognize them.
267+
fn cjk_compat_variants(self) -> Replacements<I>;
268+
269+
/// An Iterator over the string with Conjoining Grapheme Joiner characters
270+
/// inserted according to the Stream-Safe Text Process (UAX15-D4)
271+
fn stream_safe(self) -> StreamSafe<I>;
272+
}
273+
274+
impl<'a> UnicodeNormalizationAlignment<Chars<'a>> for &'a str {
275+
#[inline]
276+
fn nfd(self) -> DecompositionsAlignment<Chars<'a>> {
277+
decompose_alignment::new_canonical(self.chars())
278+
}
279+
280+
#[inline]
281+
fn nfkd(self) -> DecompositionsAlignment<Chars<'a>> {
282+
decompose_alignment::new_compatible(self.chars())
283+
}
284+
285+
#[inline]
286+
fn nfc(self) -> RecompositionsAlignment<Chars<'a>> {
287+
recompose_alignment::new_canonical(self.chars())
288+
}
289+
290+
#[inline]
291+
fn nfkc(self) -> RecompositionsAlignment<Chars<'a>> {
292+
recompose_alignment::new_compatible(self.chars())
293+
}
294+
295+
#[inline]
296+
fn cjk_compat_variants(self) -> Replacements<Chars<'a>> {
297+
replace::new_cjk_compat_variants(self.chars())
298+
}
299+
300+
#[inline]
301+
fn stream_safe(self) -> StreamSafe<Chars<'a>> {
302+
StreamSafe::new(self.chars())
303+
}
304+
}
305+
306+
impl UnicodeNormalizationAlignment<option::IntoIter<char>> for char {
307+
#[inline]
308+
fn nfd(self) -> DecompositionsAlignment<option::IntoIter<char>> {
309+
decompose_alignment::new_canonical(Some(self).into_iter())
310+
}
311+
312+
#[inline]
313+
fn nfkd(self) -> DecompositionsAlignment<option::IntoIter<char>> {
314+
decompose_alignment::new_compatible(Some(self).into_iter())
315+
}
316+
317+
#[inline]
318+
fn nfc(self) -> RecompositionsAlignment<option::IntoIter<char>> {
319+
recompose_alignment::new_canonical(Some(self).into_iter())
320+
}
321+
322+
#[inline]
323+
fn nfkc(self) -> RecompositionsAlignment<option::IntoIter<char>> {
324+
recompose_alignment::new_compatible(Some(self).into_iter())
325+
}
326+
327+
#[inline]
328+
fn cjk_compat_variants(self) -> Replacements<option::IntoIter<char>> {
329+
replace::new_cjk_compat_variants(Some(self).into_iter())
330+
}
331+
332+
#[inline]
333+
fn stream_safe(self) -> StreamSafe<option::IntoIter<char>> {
334+
StreamSafe::new(Some(self).into_iter())
335+
}
336+
}
337+
338+
impl<I: Iterator<Item = char>> UnicodeNormalizationAlignment<I> for I {
339+
#[inline]
340+
fn nfd(self) -> DecompositionsAlignment<I> {
341+
decompose_alignment::new_canonical(self)
342+
}
343+
344+
#[inline]
345+
fn nfkd(self) -> DecompositionsAlignment<I> {
346+
decompose_alignment::new_compatible(self)
347+
}
348+
349+
#[inline]
350+
fn nfc(self) -> RecompositionsAlignment<I> {
351+
recompose_alignment::new_canonical(self)
352+
}
353+
354+
#[inline]
355+
fn nfkc(self) -> RecompositionsAlignment<I> {
356+
recompose_alignment::new_compatible(self)
357+
}
358+
359+
#[inline]
360+
fn cjk_compat_variants(self) -> Replacements<I> {
361+
replace::new_cjk_compat_variants(self)
362+
}
363+
364+
#[inline]
365+
fn stream_safe(self) -> StreamSafe<I> {
366+
StreamSafe::new(self)
367+
}
368+
}

0 commit comments

Comments
 (0)