Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Run rustfmt #104

Merged
merged 4 commits into from
Feb 7, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,13 @@ jobs:
run: cargo build --verbose
- name: Run tests
run: cargo test --verbose
fmt:

runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v2
- name: Rustfmt
run: cargo fmt --check
- name: Verify regenerated files
run: ./scripts/unicode.py && diff tables.rs src/tables.rs
run: ./scripts/unicode.py && diff tables.rs src/tables.rs
18 changes: 9 additions & 9 deletions benches/graphemes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use unicode_segmentation::UnicodeSegmentation;
fn graphemes(c: &mut Criterion, lang: &str, path: &str) {
let text = fs::read_to_string(path).unwrap();

c.bench_function(&format!("graphemes_{}",lang), |bench| {
c.bench_function(&format!("graphemes_{}", lang), |bench| {
bench.iter(|| {
for g in UnicodeSegmentation::graphemes(black_box(&*text), true) {
black_box(g);
Expand All @@ -17,35 +17,35 @@ fn graphemes(c: &mut Criterion, lang: &str, path: &str) {
}

fn graphemes_arabic(c: &mut Criterion) {
graphemes(c, "arabic" ,"benches/texts/arabic.txt");
graphemes(c, "arabic", "benches/texts/arabic.txt");
}

fn graphemes_english(c: &mut Criterion) {
graphemes(c, "english" ,"benches/texts/english.txt");
graphemes(c, "english", "benches/texts/english.txt");
}

fn graphemes_hindi(c: &mut Criterion) {
graphemes(c, "hindi" ,"benches/texts/hindi.txt");
graphemes(c, "hindi", "benches/texts/hindi.txt");
}

fn graphemes_japanese(c: &mut Criterion) {
graphemes(c, "japanese" ,"benches/texts/japanese.txt");
graphemes(c, "japanese", "benches/texts/japanese.txt");
}

fn graphemes_korean(c: &mut Criterion) {
graphemes(c, "korean" ,"benches/texts/korean.txt");
graphemes(c, "korean", "benches/texts/korean.txt");
}

fn graphemes_mandarin(c: &mut Criterion) {
graphemes(c, "mandarin" ,"benches/texts/mandarin.txt");
graphemes(c, "mandarin", "benches/texts/mandarin.txt");
}

fn graphemes_russian(c: &mut Criterion) {
graphemes(c, "russian" ,"benches/texts/russian.txt");
graphemes(c, "russian", "benches/texts/russian.txt");
}

fn graphemes_source_code(c: &mut Criterion) {
graphemes(c, "source_code","benches/texts/source_code.txt");
graphemes(c, "source_code", "benches/texts/source_code.txt");
}

criterion_group!(
Expand Down
2 changes: 1 addition & 1 deletion benches/unicode_words.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ extern crate bencher;
extern crate unicode_segmentation;

use bencher::Bencher;
use unicode_segmentation::UnicodeSegmentation;
use std::fs;
use unicode_segmentation::UnicodeSegmentation;

fn unicode_words(bench: &mut Bencher, path: &str) {
let text = fs::read_to_string(path).unwrap();
Expand Down
2 changes: 1 addition & 1 deletion benches/word_bounds.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ extern crate bencher;
extern crate unicode_segmentation;

use bencher::Bencher;
use unicode_segmentation::UnicodeSegmentation;
use std::fs;
use unicode_segmentation::UnicodeSegmentation;

fn word_bounds(bench: &mut Bencher, path: &str) {
let text = fs::read_to_string(path).unwrap();
Expand Down
143 changes: 92 additions & 51 deletions src/grapheme.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,9 @@ impl<'a> Iterator for GraphemeIndices<'a> {

#[inline]
fn next(&mut self) -> Option<(usize, &'a str)> {
self.iter.next().map(|s| (s.as_ptr() as usize - self.start_offset, s))
self.iter
.next()
.map(|s| (s.as_ptr() as usize - self.start_offset, s))
}

#[inline]
Expand All @@ -61,7 +63,9 @@ impl<'a> Iterator for GraphemeIndices<'a> {
impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
#[inline]
fn next_back(&mut self) -> Option<(usize, &'a str)> {
self.iter.next_back().map(|s| (s.as_ptr() as usize - self.start_offset, s))
self.iter
.next_back()
.map(|s| (s.as_ptr() as usize - self.start_offset, s))
}
}

Expand Down Expand Up @@ -126,7 +130,11 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> {
if end == self.cursor.cur_cursor() {
return None;
}
let prev = self.cursor_back.prev_boundary(self.string, 0).unwrap().unwrap();
let prev = self
.cursor_back
.prev_boundary(self.string, 0)
.unwrap()
.unwrap();
Some(&self.string[prev..end])
}
}
Expand All @@ -143,7 +151,10 @@ pub fn new_graphemes<'b>(s: &'b str, is_extended: bool) -> Graphemes<'b> {

#[inline]
pub fn new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndices<'b> {
GraphemeIndices { start_offset: s.as_ptr() as usize, iter: new_graphemes(s, is_extended) }
GraphemeIndices {
start_offset: s.as_ptr() as usize,
iter: new_graphemes(s, is_extended),
}
}

// maybe unify with PairResult?
Expand Down Expand Up @@ -215,7 +226,7 @@ pub enum GraphemeIncomplete {
/// current chunk, so the chunk after that is requested. This will only be
/// returned if the chunk ends before the `len` parameter provided on
/// creation of the cursor.
NextChunk, // requesting chunk following the one given
NextChunk, // requesting chunk following the one given

/// An error returned when the chunk given does not contain the cursor position.
InvalidOffset,
Expand All @@ -224,42 +235,42 @@ pub enum GraphemeIncomplete {
// An enum describing the result from lookup of a pair of categories.
#[derive(PartialEq, Eq)]
enum PairResult {
NotBreak, // definitely not a break
Break, // definitely a break
Extended, // a break iff not in extended mode
Regional, // a break if preceded by an even number of RIS
Emoji, // a break if preceded by emoji base and (Extend)*
NotBreak, // definitely not a break
Break, // definitely a break
Extended, // a break iff not in extended mode
Regional, // a break if preceded by an even number of RIS
Emoji, // a break if preceded by emoji base and (Extend)*
}

#[inline]
fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
use crate::tables::grapheme::GraphemeCat::*;
use self::PairResult::*;
use crate::tables::grapheme::GraphemeCat::*;
match (before, after) {
(GC_CR, GC_LF) => NotBreak, // GB3
(GC_Control, _) => Break, // GB4
(GC_CR, _) => Break, // GB4
(GC_LF, _) => Break, // GB4
(_, GC_Control) => Break, // GB5
(_, GC_CR) => Break, // GB5
(_, GC_LF) => Break, // GB5
(GC_L, GC_L) => NotBreak, // GB6
(GC_L, GC_V) => NotBreak, // GB6
(GC_L, GC_LV) => NotBreak, // GB6
(GC_L, GC_LVT) => NotBreak, // GB6
(GC_LV, GC_V) => NotBreak, // GB7
(GC_LV, GC_T) => NotBreak, // GB7
(GC_V, GC_V) => NotBreak, // GB7
(GC_V, GC_T) => NotBreak, // GB7
(GC_LVT, GC_T) => NotBreak, // GB8
(GC_T, GC_T) => NotBreak, // GB8
(_, GC_Extend) => NotBreak, // GB9
(_, GC_ZWJ) => NotBreak, // GB9
(_, GC_SpacingMark) => Extended, // GB9a
(GC_Prepend, _) => Extended, // GB9b
(GC_ZWJ, GC_Extended_Pictographic) => Emoji, // GB11
(GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13
(_, _) => Break, // GB999
(GC_CR, GC_LF) => NotBreak, // GB3
(GC_Control, _) => Break, // GB4
(GC_CR, _) => Break, // GB4
(GC_LF, _) => Break, // GB4
(_, GC_Control) => Break, // GB5
(_, GC_CR) => Break, // GB5
(_, GC_LF) => Break, // GB5
(GC_L, GC_L) => NotBreak, // GB6
(GC_L, GC_V) => NotBreak, // GB6
(GC_L, GC_LV) => NotBreak, // GB6
(GC_L, GC_LVT) => NotBreak, // GB6
(GC_LV, GC_V) => NotBreak, // GB7
(GC_LV, GC_T) => NotBreak, // GB7
(GC_V, GC_V) => NotBreak, // GB7
(GC_V, GC_T) => NotBreak, // GB7
(GC_LVT, GC_T) => NotBreak, // GB8
(GC_T, GC_T) => NotBreak, // GB8
(_, GC_Extend) => NotBreak, // GB9
(_, GC_ZWJ) => NotBreak, // GB9
(_, GC_SpacingMark) => Extended, // GB9a
(GC_Prepend, _) => Extended, // GB9b
(GC_ZWJ, GC_Extended_Pictographic) => Emoji, // GB11
(GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13
(_, _) => Break, // GB999
}
}

Expand Down Expand Up @@ -397,17 +408,19 @@ impl GraphemeCursor {
if self.is_extended && chunk_start + chunk.len() == self.offset {
let ch = chunk.chars().rev().next().unwrap();
if self.grapheme_category(ch) == gr::GC_Prepend {
self.decide(false); // GB9b
self.decide(false); // GB9b
return;
}
}
match self.state {
GraphemeState::Regional => self.handle_regional(chunk, chunk_start),
GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start),
_ => if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start {
let ch = chunk.chars().rev().next().unwrap();
self.cat_before = Some(self.grapheme_category(ch));
},
_ => {
if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start {
let ch = chunk.chars().rev().next().unwrap();
self.cat_before = Some(self.grapheme_category(ch));
}
}
}
}

Expand Down Expand Up @@ -515,17 +528,21 @@ impl GraphemeCursor {
/// cursor.set_cursor(12);
/// assert_eq!(cursor.is_boundary(flags, 0), Ok(false));
/// ```
pub fn is_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<bool, GraphemeIncomplete> {
pub fn is_boundary(
&mut self,
chunk: &str,
chunk_start: usize,
) -> Result<bool, GraphemeIncomplete> {
use crate::tables::grapheme as gr;
if self.state == GraphemeState::Break {
return Ok(true)
return Ok(true);
}
if self.state == GraphemeState::NotBreak {
return Ok(false)
return Ok(false);
}
if self.offset < chunk_start || self.offset >= chunk_start + chunk.len() {
if self.offset > chunk_start + chunk.len() || self.cat_after.is_none() {
return Err(GraphemeIncomplete::InvalidOffset)
return Err(GraphemeIncomplete::InvalidOffset);
}
}
if let Some(pre_context_offset) = self.pre_context_offset {
Expand Down Expand Up @@ -606,7 +623,11 @@ impl GraphemeCursor {
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(4)));
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None));
/// ```
pub fn next_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<Option<usize>, GraphemeIncomplete> {
pub fn next_boundary(
&mut self,
chunk: &str,
chunk_start: usize,
) -> Result<Option<usize>, GraphemeIncomplete> {
if self.offset == self.len {
return Ok(None);
}
Expand Down Expand Up @@ -681,7 +702,11 @@ impl GraphemeCursor {
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(0)));
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None));
/// ```
pub fn prev_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<Option<usize>, GraphemeIncomplete> {
pub fn prev_boundary(
&mut self,
chunk: &str,
chunk_start: usize,
) -> Result<Option<usize>, GraphemeIncomplete> {
if self.offset == 0 {
return Ok(None);
}
Expand All @@ -702,7 +727,11 @@ impl GraphemeCursor {
self.cat_after = self.cat_before.take();
self.state = GraphemeState::Unknown;
if let Some(ris_count) = self.ris_count {
self.ris_count = if ris_count > 0 { Some(ris_count - 1) } else { None };
self.ris_count = if ris_count > 0 {
Some(ris_count - 1)
} else {
None
};
}
if let Some(prev_ch) = iter.next() {
ch = prev_ch;
Expand All @@ -729,7 +758,10 @@ impl GraphemeCursor {
fn test_grapheme_cursor_ris_precontext() {
let s = "\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}";
let mut c = GraphemeCursor::new(8, s.len(), true);
assert_eq!(c.is_boundary(&s[4..], 4), Err(GraphemeIncomplete::PreContext(4)));
assert_eq!(
c.is_boundary(&s[4..], 4),
Err(GraphemeIncomplete::PreContext(4))
);
c.provide_context(&s[..4], 0);
assert_eq!(c.is_boundary(&s[4..], 4), Ok(true));
}
Expand All @@ -738,7 +770,10 @@ fn test_grapheme_cursor_ris_precontext() {
fn test_grapheme_cursor_chunk_start_require_precontext() {
let s = "\r\n";
let mut c = GraphemeCursor::new(1, s.len(), true);
assert_eq!(c.is_boundary(&s[1..], 1), Err(GraphemeIncomplete::PreContext(1)));
assert_eq!(
c.is_boundary(&s[1..], 1),
Err(GraphemeIncomplete::PreContext(1))
);
c.provide_context(&s[..1], 0);
assert_eq!(c.is_boundary(&s[1..], 1), Ok(false));
}
Expand All @@ -747,14 +782,20 @@ fn test_grapheme_cursor_chunk_start_require_precontext() {
fn test_grapheme_cursor_prev_boundary() {
let s = "abcd";
let mut c = GraphemeCursor::new(3, s.len(), true);
assert_eq!(c.prev_boundary(&s[2..], 2), Err(GraphemeIncomplete::PrevChunk));
assert_eq!(
c.prev_boundary(&s[2..], 2),
Err(GraphemeIncomplete::PrevChunk)
);
assert_eq!(c.prev_boundary(&s[..2], 0), Ok(Some(2)));
}

#[test]
fn test_grapheme_cursor_prev_boundary_chunk_start() {
let s = "abcd";
let mut c = GraphemeCursor::new(2, s.len(), true);
assert_eq!(c.prev_boundary(&s[2..], 2), Err(GraphemeIncomplete::PrevChunk));
assert_eq!(
c.prev_boundary(&s[2..], 2),
Err(GraphemeIncomplete::PrevChunk)
);
assert_eq!(c.prev_boundary(&s[..2], 0), Ok(Some(1)));
}
16 changes: 9 additions & 7 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,10 @@
//! ```

#![deny(missing_docs, unsafe_code)]
#![doc(html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png")]

#![doc(
html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png"
)]
#![no_std]

#[cfg(test)]
Expand All @@ -63,16 +64,17 @@ extern crate std;
#[macro_use]
extern crate quickcheck;

pub use grapheme::{Graphemes, GraphemeIndices};
pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
pub use grapheme::{GraphemeIndices, Graphemes};
pub use sentence::{USentenceBoundIndices, USentenceBounds, UnicodeSentences};
pub use tables::UNICODE_VERSION;
pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords, UnicodeWordIndices};
pub use sentence::{USentenceBounds, USentenceBoundIndices, UnicodeSentences};
pub use word::{UWordBoundIndices, UWordBounds, UnicodeWordIndices, UnicodeWords};

mod grapheme;
#[rustfmt::skip]
mod tables;
mod word;
mod sentence;
mod word;

#[cfg(test)]
mod test;
Expand Down
Loading