Skip to content

Commit

Permalink
add core::char::DecodeUtf8
Browse files Browse the repository at this point in the history
  • Loading branch information
M Farkas-Dyck committed Jul 14, 2016
1 parent fe96928 commit 837029f
Show file tree
Hide file tree
Showing 5 changed files with 80 additions and 0 deletions.
47 changes: 47 additions & 0 deletions src/libcore/char.rs
Original file line number Diff line number Diff line change
Expand Up @@ -676,3 +676,50 @@ impl Iterator for EncodeUtf16 {
self.as_slice().iter().size_hint()
}
}


/// An iterator adaptor that decodes a stream of bytes as UTF-8.
///
/// Values of this type are created by the `decode_utf8` function. Each item
/// is a `Result<char, InvalidSequence>`: `Ok` for a successfully decoded
/// character, `Err` for a byte sequence that could not be decoded.
///
/// The underlying byte iterator is wrapped in a `Peekable` so that the
/// decoder can look at the next byte without consuming it.
#[unstable(feature = "decode_utf8", issue = "33906")]
#[derive(Clone, Debug)]
pub struct DecodeUtf8<I: Iterator<Item = u8>>(::iter::Peekable<I>);

/// Creates a UTF-8 decoder over anything that can be iterated as bytes.
///
/// The argument is converted with `IntoIterator::into_iter` and made
/// peekable; the returned `DecodeUtf8` then yields one
/// `Result<char, InvalidSequence>` per decoded (or rejected) sequence.
#[unstable(feature = "decode_utf8", issue = "33906")]
#[inline]
pub fn decode_utf8<I: IntoIterator<Item = u8>>(i: I) -> DecodeUtf8<I::IntoIter> {
    let bytes = i.into_iter();
    let lookahead = bytes.peekable();
    DecodeUtf8(lookahead)
}

/// The error yielded by `<DecodeUtf8 as Iterator>::next` for an invalid input
/// sequence.
///
/// It carries no information about the offending bytes. The single private
/// `()` field keeps the type from being constructed outside this module.
#[unstable(feature = "decode_utf8", issue = "33906")]
#[derive(PartialEq, Debug)]
pub struct InvalidSequence(());

#[unstable(feature = "decode_utf8", issue = "33906")]
impl<I: Iterator<Item = u8>> Iterator for DecodeUtf8<I> {
    type Item = Result<char, InvalidSequence>;

    /// Decodes the next character from the underlying byte stream.
    ///
    /// Returns `None` once the byte iterator is exhausted. Otherwise returns
    /// `Some(Ok(c))` for a decoded character or `Some(Err(InvalidSequence))`
    /// when the bytes consumed for this item do not decode to a `char`.
    ///
    /// A byte that fails the continuation-byte check is only peeked, never
    /// consumed, so it becomes the first byte of the following item.
    #[inline]
    fn next(&mut self) -> Option<Result<char, InvalidSequence>> {
        let lead = match self.0.next() {
            Some(byte) => byte,
            None => return None,
        };

        // High bit clear: a single-byte (ASCII) character.
        if lead & 0x80 == 0 {
            return Some(Ok(lead as char));
        }

        // The count of leading one bits in the first byte is the number of
        // bytes the sequence claims to occupy.
        let width = (!lead).leading_zeros() as usize;
        if width < 2 || width > 6 {
            // One leading one bit is a stray continuation byte; more than six
            // is not a recognised lead byte.
            return Some(Err(InvalidSequence(())));
        }

        // Payload bits of the lead byte are whatever follows the length prefix.
        let mut code = (lead as u32) & (0x7F >> width);

        let mut remaining = width - 1;
        while remaining > 0 {
            // Look before consuming so a non-continuation byte is left in place.
            let cont = match self.0.peek() {
                Some(&byte) if byte & 0xC0 == 0x80 => byte,
                _ => return Some(Err(InvalidSequence(()))),
            };
            self.0.next();
            code = (code << 6) | ((cont as u32) & 0x3F);
            remaining -= 1;
        }

        // `from_u32` must accept the value, and the decoded character's own
        // UTF-8 length must equal the declared length; otherwise the sequence
        // is rejected (this is what refuses overlong encodings).
        Some(match from_u32(code) {
            Some(ch) if ch.len_utf8() == width => Ok(ch),
            _ => Err(InvalidSequence(())),
        })
    }
}
29 changes: 29 additions & 0 deletions src/libcoretest/char.rs
Original file line number Diff line number Diff line change
Expand Up @@ -302,3 +302,32 @@ fn eu_iterator_specializations() {
check('\u{12340}');
check('\u{10FFFF}');
}

#[test]
fn test_decode_utf8() {
    use core::char::*;
    use core::iter::FromIterator;

    // Each case pairs the expected text with the raw bytes to decode. Every
    // invalid sequence reported by the decoder is expected to show up as one
    // U+FFFD replacement character.
    let cases: &[(&str, &[u8])] = &[
        ("", &[] as &[u8]),
        ("A", &[0x41u8] as &[u8]),
        ("�", &[0xC1u8, 0x81u8] as &[u8]),
        ("♥", &[0xE2u8, 0x99u8, 0xA5u8] as &[u8]),
        ("♥A", &[0xE2u8, 0x99u8, 0xA5u8, 0x41u8] as &[u8]),
        ("�", &[0xE2u8, 0x99u8] as &[u8]),
        ("�A", &[0xE2u8, 0x99u8, 0x41u8] as &[u8]),
        ("�", &[0xC0u8] as &[u8]),
        ("�A", &[0xC0u8, 0x41u8] as &[u8]),
        ("�", &[0x80u8] as &[u8]),
        ("�A", &[0x80u8, 0x41u8] as &[u8]),
        ("�", &[0xFEu8] as &[u8]),
        ("�A", &[0xFEu8, 0x41u8] as &[u8]),
        ("�", &[0xFFu8] as &[u8]),
        ("�A", &[0xFFu8, 0x41u8] as &[u8]),
    ];

    for &(expected, bytes) in cases {
        // Decode once, mapping every error to U+FFFD, and reuse the result
        // for both the comparison and the failure message.
        let decoded = Vec::from_iter(decode_utf8(bytes.iter().cloned())
                                         .map(|item| item.unwrap_or('\u{FFFD}')));
        assert!(Iterator::eq(expected.chars(), decoded.iter().cloned()),
                "chars = {}, bytes = {:?}, decoded = {:?}", expected, bytes, decoded);
    }
}
1 change: 1 addition & 0 deletions src/libcoretest/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#![feature(core_private_bignum)]
#![feature(core_private_diy_float)]
#![feature(dec2flt)]
#![feature(decode_utf8)]
#![feature(fixed_size_array)]
#![feature(float_extras)]
#![feature(flt2dec)]
Expand Down
2 changes: 2 additions & 0 deletions src/librustc_unicode/char.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ pub use core::char::{MAX, from_digit, from_u32, from_u32_unchecked};
pub use core::char::{EncodeUtf16, EncodeUtf8, EscapeDefault, EscapeUnicode};

// unstable reexports
#[unstable(feature = "decode_utf8", issue = "33906")]
pub use core::char::{DecodeUtf8, decode_utf8};
#[unstable(feature = "unicode", issue = "27783")]
pub use tables::UNICODE_VERSION;

Expand Down
1 change: 1 addition & 0 deletions src/librustc_unicode/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#![no_std]

#![feature(core_char_ext)]
#![feature(decode_utf8)]
#![feature(lang_items)]
#![feature(staged_api)]
#![feature(unicode)]
Expand Down

0 comments on commit 837029f

Please sign in to comment.