Skip to content

Add ascii fast path for unicode_word_indices and unicode_words #147

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ no_std = [] # This is a no-op, preserved for backward compatibility only.
[dev-dependencies]
quickcheck = "0.7"
criterion = "0.5"
proptest = "1.7.0"

[[bench]]
name = "chars"
Expand All @@ -36,3 +37,8 @@ harness = false
[[bench]]
name = "word_bounds"
harness = false

[[bench]]
name = "unicode_word_indices"
harness = false

4 changes: 2 additions & 2 deletions benches/chars.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,15 +41,15 @@ fn bench_all(c: &mut Criterion) {
for file in FILES {
group.bench_with_input(
BenchmarkId::new("grapheme", file),
&fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
&fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
|b, content| b.iter(|| grapheme(content)),
);
}

for file in FILES {
group.bench_with_input(
BenchmarkId::new("scalar", file),
&fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
&fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
|b, content| b.iter(|| scalar(content)),
);
}
Expand Down
1 change: 1 addition & 0 deletions benches/texts/log.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
2018-07-12 13:59:01 UTC | ERROR | (worker.go:131 in process) | Too many errors for endpoint 'dummy/api/v1/check_run?api_key=*************************00000': retrying later
37 changes: 37 additions & 0 deletions benches/unicode_word_indices.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};

use std::fs;
use unicode_segmentation::UnicodeSegmentation;

// Names of text fixtures (loaded from benches/texts/<name>.txt) exercised by
// this benchmark. The commented-out entries are disabled — presumably their
// fixture files are absent from benches/texts/ or were excluded to keep the
// run short; TODO confirm before re-enabling.
const FILES: &[&str] = &[
"log", //"arabic",
"english",
//"hindi",
"japanese",
//"korean",
//"mandarin",
//"russian",
//"source_code",
];

/// Drives `unicode_word_indices` over `text`, passing every yielded
/// `(byte_offset, word)` pair through `black_box` so the optimizer cannot
/// elide the iteration being measured.
///
/// NOTE(review): the name mirrors the sibling bench files (chars.rs,
/// words.rs) but this routine exercises word segmentation, not graphemes.
#[inline(always)]
fn grapheme(text: &str) {
    text.unicode_word_indices().for_each(|pair| {
        black_box(pair);
    });
}

/// Benchmarks `unicode_word_indices` over each fixture listed in `FILES`,
/// reporting throughput in bytes of input text per second so results are
/// comparable across fixtures of different sizes.
fn bench_all(c: &mut Criterion) {
    let mut group = c.benchmark_group("unicode_word_indices");

    for file in FILES {
        // A missing fixture is a broken bench setup, not a recoverable
        // condition — state the invariant instead of a bare unwrap().
        let input = fs::read_to_string(format!("benches/texts/{file}.txt"))
            .expect("benchmark fixture file must exist under benches/texts/");
        group.throughput(criterion::Throughput::Bytes(input.len() as u64));
        group.bench_with_input(BenchmarkId::from_parameter(file), &input, |b, content| {
            b.iter(|| grapheme(content))
        });
    }

    // Finish the group explicitly (criterion convention) rather than relying
    // on the implicit finish in Drop, so summary output is always emitted.
    group.finish();
}

// Register the benchmark group and generate the `main` entry point that
// criterion requires for a bench target built with `harness = false`
// (see the matching [[bench]] section added to Cargo.toml).
criterion_group!(benches, bench_all);
criterion_main!(benches);
2 changes: 1 addition & 1 deletion benches/word_bounds.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ fn bench_all(c: &mut Criterion) {
for file in FILES {
group.bench_with_input(
BenchmarkId::new("grapheme", file),
&fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
&fs::read_to_string(format!("benches/texts/{file}.txt",)).unwrap(),
|b, content| b.iter(|| grapheme(content)),
);
}
Expand Down
4 changes: 2 additions & 2 deletions benches/words.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,15 +41,15 @@ fn bench_all(c: &mut Criterion) {
for file in FILES {
group.bench_with_input(
BenchmarkId::new("grapheme", file),
&fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
&fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
|b, content| b.iter(|| grapheme(content)),
);
}

for file in FILES {
group.bench_with_input(
BenchmarkId::new("scalar", file),
&fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
&fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
|b, content| b.iter(|| scalar(content)),
);
}
Expand Down
21 changes: 13 additions & 8 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,16 @@
)]
#![no_std]

#[cfg(test)]
extern crate std;

pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
pub use grapheme::{GraphemeIndices, Graphemes};
pub use sentence::{USentenceBoundIndices, USentenceBounds, UnicodeSentences};
pub use tables::UNICODE_VERSION;
pub use word::{UWordBoundIndices, UWordBounds, UnicodeWordIndices, UnicodeWords};
pub use word::{UWordBoundIndices, UWordBounds};

use crate::word::{UnicodeWordIndices, UnicodeWords};

mod grapheme;
mod sentence;
Expand Down Expand Up @@ -248,7 +253,7 @@ pub trait UnicodeSegmentation {

impl UnicodeSegmentation for str {
#[inline]
fn graphemes(&self, is_extended: bool) -> Graphemes {
fn graphemes(&self, is_extended: bool) -> Graphemes<'_> {
grapheme::new_graphemes(self, is_extended)
}

Expand All @@ -258,32 +263,32 @@ impl UnicodeSegmentation for str {
}

#[inline]
fn unicode_words(&self) -> UnicodeWords {
fn unicode_words(&self) -> UnicodeWords<'_> {
word::new_unicode_words(self)
}

#[inline]
fn unicode_word_indices(&self) -> UnicodeWordIndices {
fn unicode_word_indices(&self) -> UnicodeWordIndices<'_> {
word::new_unicode_word_indices(self)
}

#[inline]
fn split_word_bounds(&self) -> UWordBounds {
fn split_word_bounds(&self) -> UWordBounds<'_> {
word::new_word_bounds(self)
}

#[inline]
fn split_word_bound_indices(&self) -> UWordBoundIndices {
fn split_word_bound_indices(&self) -> UWordBoundIndices<'_> {
word::new_word_bound_indices(self)
}

#[inline]
fn unicode_sentences(&self) -> UnicodeSentences {
fn unicode_sentences(&self) -> UnicodeSentences<'_> {
sentence::new_unicode_sentences(self)
}

#[inline]
fn split_sentence_bounds(&self) -> USentenceBounds {
fn split_sentence_bounds(&self) -> USentenceBounds<'_> {
sentence::new_sentence_bounds(self)
}

Expand Down
Loading