diff --git a/src/libcore/char.rs b/src/libcore/char.rs
index d80b456181ae4..0e7f04c775825 100644
--- a/src/libcore/char.rs
+++ b/src/libcore/char.rs
@@ -676,3 +676,50 @@ impl Iterator for EncodeUtf16 {
         self.as_slice().iter().size_hint()
     }
 }
+
+
+/// An iterator over an iterator of bytes of the characters the bytes represent
+/// as UTF-8
+#[unstable(feature = "decode_utf8", issue = "33906")]
+#[derive(Clone, Debug)]
+pub struct DecodeUtf8<I: Iterator<Item = u8>>(::iter::Peekable<I>);
+
+/// Decodes an `Iterator` of bytes as UTF-8.
+#[unstable(feature = "decode_utf8", issue = "33906")]
+#[inline]
+pub fn decode_utf8<I: IntoIterator<Item = u8>>(i: I) -> DecodeUtf8<I::IntoIter> {
+    DecodeUtf8(i.into_iter().peekable())
+}
+
+/// `<DecodeUtf8 as Iterator>::next` returns this for an invalid input sequence.
+#[unstable(feature = "decode_utf8", issue = "33906")]
+#[derive(PartialEq, Debug)]
+pub struct InvalidSequence(());
+
+#[unstable(feature = "decode_utf8", issue = "33906")]
+impl<I: Iterator<Item = u8>> Iterator for DecodeUtf8<I> {
+    type Item = Result<char, InvalidSequence>;
+    #[inline]
+    fn next(&mut self) -> Option<Result<char, InvalidSequence>> {
+        self.0.next().map(|b| {
+            if b & 0x80 == 0 { Ok(b as char) } else {
+                let l = (!b).leading_zeros() as usize; // number of bytes in UTF-8 representation
+                if l < 2 || l > 6 { return Err(InvalidSequence(())) };
+                let mut x = (b as u32) & (0x7F >> l);
+                for _ in 0..l-1 {
+                    match self.0.peek() {
+                        Some(&b) if b & 0xC0 == 0x80 => {
+                            self.0.next();
+                            x = (x << 6) | (b as u32) & 0x3F;
+                        },
+                        _ => return Err(InvalidSequence(())),
+                    }
+                }
+                match from_u32(x) {
+                    Some(x) if l == x.len_utf8() => Ok(x),
+                    _ => Err(InvalidSequence(())),
+                }
+            }
+        })
+    }
+}
diff --git a/src/libcoretest/char.rs b/src/libcoretest/char.rs
index 7da876b945947..c8906fed3d2fa 100644
--- a/src/libcoretest/char.rs
+++ b/src/libcoretest/char.rs
@@ -302,3 +302,32 @@ fn eu_iterator_specializations() {
     check('\u{12340}');
     check('\u{10FFFF}');
 }
+
+#[test]
+fn test_decode_utf8() {
+    use core::char::*;
+    use core::iter::FromIterator;
+
+    for &(str, bs) in [("", &[] as &[u8]),
+                       ("A", &[0x41u8] as &[u8]),
+                       ("�", &[0xC1u8, 0x81u8] as &[u8]),
+                       ("♥", &[0xE2u8, 0x99u8, 0xA5u8]),
+                       ("♥A", &[0xE2u8, 0x99u8, 0xA5u8, 0x41u8] as &[u8]),
+                       ("�", &[0xE2u8, 0x99u8] as &[u8]),
+                       ("�A", &[0xE2u8, 0x99u8, 0x41u8] as &[u8]),
+                       ("�", &[0xC0u8] as &[u8]),
+                       ("�A", &[0xC0u8, 0x41u8] as &[u8]),
+                       ("�", &[0x80u8] as &[u8]),
+                       ("�A", &[0x80u8, 0x41u8] as &[u8]),
+                       ("�", &[0xFEu8] as &[u8]),
+                       ("�A", &[0xFEu8, 0x41u8] as &[u8]),
+                       ("�", &[0xFFu8] as &[u8]),
+                       ("�A", &[0xFFu8, 0x41u8] as &[u8])].into_iter() {
+        assert!(Iterator::eq(str.chars(),
+                             decode_utf8(bs.into_iter().map(|&b|b))
+                                 .map(|r_b| r_b.unwrap_or('\u{FFFD}'))),
+                "chars = {}, bytes = {:?}, decoded = {:?}", str, bs,
+                Vec::from_iter(decode_utf8(bs.into_iter().map(|&b|b))
+                                   .map(|r_b| r_b.unwrap_or('\u{FFFD}'))));
+    }
+}
diff --git a/src/libcoretest/lib.rs b/src/libcoretest/lib.rs
index 88d73df937f7e..d46dcdb51e85e 100644
--- a/src/libcoretest/lib.rs
+++ b/src/libcoretest/lib.rs
@@ -18,6 +18,7 @@
 #![feature(core_private_bignum)]
 #![feature(core_private_diy_float)]
 #![feature(dec2flt)]
+#![feature(decode_utf8)]
 #![feature(fixed_size_array)]
 #![feature(float_extras)]
 #![feature(flt2dec)]
diff --git a/src/librustc_unicode/char.rs b/src/librustc_unicode/char.rs
index f570375de5ea1..b1ec96238be78 100644
--- a/src/librustc_unicode/char.rs
+++ b/src/librustc_unicode/char.rs
@@ -39,6 +39,8 @@ pub use core::char::{MAX, from_digit, from_u32, from_u32_unchecked};
 pub use core::char::{EncodeUtf16, EncodeUtf8, EscapeDefault, EscapeUnicode};
 
 // unstable reexports
+#[unstable(feature = "decode_utf8", issue = "33906")]
+pub use core::char::{DecodeUtf8, decode_utf8};
 #[unstable(feature = "unicode", issue = "27783")]
 pub use tables::UNICODE_VERSION;
 
diff --git a/src/librustc_unicode/lib.rs b/src/librustc_unicode/lib.rs
index b03d7ee79e89c..f91a754ab57db 100644
--- a/src/librustc_unicode/lib.rs
+++ b/src/librustc_unicode/lib.rs
@@ -33,6 +33,7 @@
 #![no_std]
 
 #![feature(core_char_ext)]
+#![feature(decode_utf8)]
 #![feature(lang_items)]
 #![feature(staged_api)]
 #![feature(unicode)]