rust-lang · bors · Jul 14, 2016 · May 27, 2016 · BurntSushi · Jul 6, 2016
diff --git a/src/libcore/char.rs b/src/libcore/char.rs
@@ -676,3 +676,50 @@ impl Iterator for EncodeUtf16 {
         self.as_slice().iter().size_hint()
     }
 }
+
+
+/// An iterator that decodes the bytes yielded by an inner `u8` iterator as UTF-8,
+/// producing `Ok(char)` for each valid sequence and `Err(InvalidSequence)` otherwise.
+#[unstable(feature = "decode_utf8", issue = "33906")]
+#[derive(Clone, Debug)]
+pub struct DecodeUtf8<I: Iterator<Item = u8>>(::iter::Peekable<I>);
+
+/// Decodes the bytes of any `IntoIterator<Item = u8>` as UTF-8, lazily, via `DecodeUtf8`.
+#[unstable(feature = "decode_utf8", issue = "33906")]
+#[inline]
+pub fn decode_utf8<I: IntoIterator<Item = u8>>(i: I) -> DecodeUtf8<I::IntoIter> {
+    DecodeUtf8(i.into_iter().peekable()) // peekable: `next` inspects bytes before consuming
+}
+
+/// The error `<DecodeUtf8 as Iterator>::next` yields for an invalid input byte sequence.
+#[unstable(feature = "decode_utf8", issue = "33906")]
+#[derive(PartialEq, Debug)]
+pub struct InvalidSequence(()); // private `()` field: carries no data
+
+#[unstable(feature = "decode_utf8", issue = "33906")]
+impl<I: Iterator<Item = u8>> Iterator for DecodeUtf8<I> {
+    type Item = Result<char, InvalidSequence>; // one item per decoded char or rejected sequence
+    #[inline]
+    fn next(&mut self) -> Option<Result<char, InvalidSequence>> { // `None` at end of input
+        self.0.next().map(|b| { // `b` is the lead byte of the next sequence
+            if b & 0x80 == 0 { Ok(b as char) } else { // high bit clear: ASCII, one byte
+                let l = (!b).leading_zeros() as usize; // leading 1 bits in `b` = sequence length
+                if l < 2 || l > 6 { return Err(InvalidSequence(())) }; // 10xxxxxx, 0xFE or 0xFF
+                let mut x = (b as u32) & (0x7F >> l); // payload bits of the lead byte
+                for _ in 0..l-1 { // the remaining `l - 1` bytes must be continuations
+                    match self.0.peek() { // peek so a rejected byte stays in the stream
+                        Some(&b) if b & 0xC0 == 0x80 => { // 10xxxxxx continuation byte
+                            self.0.next(); // consume the byte just peeked
+                            x = (x << 6) | (b as u32) & 0x3F; // append its low 6 bits
+                        },
+                        _ => return Err(InvalidSequence(())), // truncated or interrupted sequence
+                    }
+                }
+                match from_u32(x) { // `None` if `x` is not a valid `char`
+                    Some(x) if l == x.len_utf8() => Ok(x), // length check rejects overlong forms
+                    _ => Err(InvalidSequence(())), // invalid scalar value or overlong encoding
+                }
+            }
+        })
+    }
+}
diff --git a/src/libcoretest/char.rs b/src/libcoretest/char.rs
@@ -302,3 +302,32 @@ fn eu_iterator_specializations() {
     check('\u{12340}');
     check('\u{10FFFF}');
 }
+
+#[test]
+fn test_decode_utf8() {
+    use core::char::*;
+    use core::iter::FromIterator;
+
+    for &(str, bs) in [("", &[] as &[u8]), // (expected lossy decoding, input bytes)
+                       ("A", &[0x41u8] as &[u8]),
+                       ("�", &[0xC1u8, 0x81u8] as &[u8]), // overlong two-byte form of 'A'
+                       ("♥", &[0xE2u8, 0x99u8, 0xA5u8]), // valid three-byte sequence
+                       ("♥A", &[0xE2u8, 0x99u8, 0xA5u8, 0x41u8] as &[u8]),
+                       ("�", &[0xE2u8, 0x99u8] as &[u8]), // truncated at end of input
+                       ("�A", &[0xE2u8, 0x99u8, 0x41u8] as &[u8]), // truncated; 'A' still decodes
+                       ("�", &[0xC0u8] as &[u8]), // lead byte with no continuation
+                       ("�A", &[0xC0u8, 0x41u8] as &[u8]),
+                       ("�", &[0x80u8] as &[u8]), // lone continuation byte
+                       ("�A", &[0x80u8, 0x41u8] as &[u8]),
+                       ("�", &[0xFEu8] as &[u8]), // seven leading 1 bits: rejected
+                       ("�A", &[0xFEu8, 0x41u8] as &[u8]),
+                       ("�", &[0xFFu8] as &[u8]), // eight leading 1 bits: rejected
+                       ("�A", &[0xFFu8, 0x41u8] as &[u8])].into_iter() {
+        assert!(Iterator::eq(str.chars(),
+                             decode_utf8(bs.into_iter().map(|&b|b))
+                                 .map(|r_b| r_b.unwrap_or('\u{FFFD}'))), // Err -> U+FFFD
+                "chars = {}, bytes = {:?}, decoded = {:?}", str, bs, // shown only on failure
+                Vec::from_iter(decode_utf8(bs.into_iter().map(|&b|b))
+                                   .map(|r_b| r_b.unwrap_or('\u{FFFD}'))));
+    }
+}
diff --git a/src/libcoretest/lib.rs b/src/libcoretest/lib.rs
@@ -18,6 +18,7 @@
 #![feature(core_private_bignum)]
 #![feature(core_private_diy_float)]
 #![feature(dec2flt)]
+#![feature(decode_utf8)]
 #![feature(fixed_size_array)]
 #![feature(float_extras)]
 #![feature(flt2dec)]

diff --git a/src/librustc_unicode/char.rs b/src/librustc_unicode/char.rs
@@ -39,6 +39,8 @@ pub use core::char::{MAX, from_digit, from_u32, from_u32_unchecked};
 pub use core::char::{EncodeUtf16, EncodeUtf8, EscapeDefault, EscapeUnicode};
 
 // unstable reexports
+#[unstable(feature = "decode_utf8", issue = "33906")]
+pub use core::char::{DecodeUtf8, decode_utf8};
 #[unstable(feature = "unicode", issue = "27783")]
 pub use tables::UNICODE_VERSION;
 

diff --git a/src/librustc_unicode/lib.rs b/src/librustc_unicode/lib.rs
@@ -33,6 +33,7 @@
 #![no_std]
 
 #![feature(core_char_ext)]
+#![feature(decode_utf8)]
 #![feature(lang_items)]
 #![feature(staged_api)]
 #![feature(unicode)]