From b0b89a57d5d548155d8a37f5e4b9f11a471bb075 Mon Sep 17 00:00:00 2001 From: Kevin Ballard Date: Wed, 5 Feb 2014 23:56:27 -0800 Subject: [PATCH] Add new function str::from_utf8_lossy() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit from_utf8_lossy() takes a byte vector and produces a ~str, converting any invalid UTF-8 sequence into the U+FFFD REPLACEMENT CHARACTER. The replacement follows the guidelines in §5.22 Best Practice for U+FFFD Substitution from the Unicode Standard (Version 6.2)[1], which also matches the WHATWG rules for utf-8 decoding[2]. [1]: http://www.unicode.org/versions/Unicode6.2.0/ch05.pdf [2]: http://encoding.spec.whatwg.org/#utf-8 --- src/libstd/str.rs | 185 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 179 insertions(+), 6 deletions(-) diff --git a/src/libstd/str.rs b/src/libstd/str.rs index e33e2c31c6d49..25e15fc16018e 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -900,16 +900,122 @@ pub struct CharRange { // The first byte is special, only want bottom 5 bits for width 2, 4 bits // for width 3, and 3 bits for width 4 macro_rules! utf8_first_byte( - ($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as uint) + ($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as u32) ) // return the value of $ch updated with continuation byte $byte macro_rules! utf8_acc_cont_byte( - ($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as uint) + ($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as u32) ) static TAG_CONT_U8: u8 = 128u8; +/// Converts a vector of bytes to a new utf-8 string. +/// Any invalid utf-8 sequences are replaced with U+FFFD REPLACEMENT CHARACTER. +/// +/// # Example +/// +/// ```rust +/// let input = bytes!("Hello ", 0xF0, 0x90, 0x80, "World"); +/// let output = std::str::from_utf8_lossy(input); +/// assert_eq!(output, ~"Hello \uFFFDWorld"); +/// ``` +pub fn from_utf8_lossy(v: &[u8]) -> ~str { + static REPLACEMENT: &'static [u8] = bytes!(0xEF, 0xBF, 0xBD); // U+FFFD in UTF-8 + let mut i = 0u; + let mut lastgood = 0u; + let total = v.len(); + fn unsafe_get(xs: &[u8], i: uint) -> u8 { + unsafe { *xs.unsafe_ref(i) } + } + fn safe_get(xs: &[u8], i: uint, total: uint) -> u8 { + if i >= total { + 0 + } else { + unsafe_get(xs, i) + } + } + let mut res = with_capacity(total); + + while i < total { + let i_ = i; + let byte = unsafe_get(v, i); + i += 1; + + macro_rules! error(() => { + unsafe { + if lastgood != i_ { + raw::push_bytes(&mut res, v.slice(lastgood, i_)); + } + lastgood = i; + raw::push_bytes(&mut res, REPLACEMENT); + } + }) + + if byte < 128u8 { + // lastgood handles this + } else { + let w = utf8_char_width(byte); + + match w { + 2 => { + if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 { + error!(); + continue; + } + i += 1; + } + 3 => { + match (byte, safe_get(v, i, total)) { + (0xE0 , 0xA0 .. 0xBF) => (), + (0xE1 .. 0xEC, 0x80 .. 0xBF) => (), + (0xED , 0x80 .. 0x9F) => (), + (0xEE .. 0xEF, 0x80 .. 0xBF) => (), + _ => { + error!(); + continue; + } + } + i += 1; + if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 { + error!(); + continue; + } + i += 1; + } + 4 => { + match (byte, safe_get(v, i, total)) { + (0xF0 , 0x90 .. 0xBF) => (), + (0xF1 .. 0xF3, 0x80 .. 0xBF) => (), + (0xF4 , 0x80 .. 0x8F) => (), + _ => { + error!(); + continue; + } + } + i += 1; + if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 { + error!(); + continue; + } + i += 1; + if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 { + error!(); + continue; + } + i += 1; + } + _ => { + error!(); + continue; + } + } + } + } + unsafe { raw::push_bytes(&mut res, v.slice(lastgood, total)) }; + res +} + /// Unsafe operations pub mod raw { use cast; @@ -2211,7 +2317,7 @@ impl<'a> StrSlice<'a> for &'a str { // Multibyte case is a fn to allow char_range_at to inline cleanly fn multibyte_char_range_at(s: &str, i: uint) -> CharRange { - let mut val = s[i] as uint; + let mut val = s[i] as u32; let w = UTF8_CHAR_WIDTH[val] as uint; assert!((w != 0)); @@ -2220,7 +2326,7 @@ impl<'a> StrSlice<'a> for &'a str { if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); } if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); } - return CharRange {ch: unsafe { transmute(val as u32) }, next: i + w}; + return CharRange {ch: unsafe { transmute(val) }, next: i + w}; } return multibyte_char_range_at(*self, i); @@ -2243,7 +2349,7 @@ impl<'a> StrSlice<'a> for &'a str { i -= 1u; } - let mut val = s[i] as uint; + let mut val = s[i] as u32; let w = UTF8_CHAR_WIDTH[val] as uint; assert!((w != 0)); @@ -2252,7 +2358,7 @@ impl<'a> StrSlice<'a> for &'a str { if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); } if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); } - return CharRange {ch: unsafe { transmute(val as u32) }, next: i}; + return CharRange {ch: unsafe { transmute(val) }, next: i}; } return multibyte_char_range_at_reverse(*self, prev); @@ -3834,6 +3940,37 @@ mod tests { assert_eq!(from_utf8_owned(xs), None); } + #[test] + fn test_str_from_utf8_lossy() { + let xs = bytes!("hello"); + assert_eq!(from_utf8_lossy(xs), ~"hello"); + + let xs = bytes!("ศไทย中华Việt Nam"); + assert_eq!(from_utf8_lossy(xs), ~"ศไทย中华Việt Nam"); + + let xs = bytes!("Hello", 0xC2, " There", 0xFF, " Goodbye"); + assert_eq!(from_utf8_lossy(xs), ~"Hello\uFFFD There\uFFFD Goodbye"); + + let xs = bytes!("Hello", 0xC0, 0x80, " There", 0xE6, 0x83, " Goodbye"); + assert_eq!(from_utf8_lossy(xs), ~"Hello\uFFFD\uFFFD There\uFFFD Goodbye"); + + let xs = bytes!(0xF5, "foo", 0xF5, 0x80, "bar"); + assert_eq!(from_utf8_lossy(xs), ~"\uFFFDfoo\uFFFD\uFFFDbar"); + + let xs = bytes!(0xF1, "foo", 0xF1, 0x80, "bar", 0xF1, 0x80, 0x80, "baz"); + assert_eq!(from_utf8_lossy(xs), ~"\uFFFDfoo\uFFFDbar\uFFFDbaz"); + + let xs = bytes!(0xF4, "foo", 0xF4, 0x80, "bar", 0xF4, 0xBF, "baz"); + assert_eq!(from_utf8_lossy(xs), ~"\uFFFDfoo\uFFFDbar\uFFFD\uFFFDbaz"); + + let xs = bytes!(0xF0, 0x80, 0x80, 0x80, "foo", 0xF0, 0x90, 0x80, 0x80, "bar"); + assert_eq!(from_utf8_lossy(xs), ~"\uFFFD\uFFFD\uFFFD\uFFFDfoo\U00010000bar"); + + // surrogates + let xs = bytes!(0xED, 0xA0, 0x80, "foo", 0xED, 0xBF, 0xBF, "bar"); + assert_eq!(from_utf8_lossy(xs), ~"\uFFFD\uFFFD\uFFFDfoo\uFFFD\uFFFD\uFFFDbar"); + } + #[test] fn test_to_send_str() { assert_eq!("abcde".to_send_str(), SendStrStatic("abcde")); @@ -3992,6 +4129,42 @@ mod bench { }); } + #[bench] + fn from_utf8_lossy_100_ascii(bh: &mut BenchHarness) { + let s = bytes!("Hello there, the quick brown fox jumped over the lazy dog! \ + Lorem ipsum dolor sit amet, consectetur. "); + + assert_eq!(100, s.len()); + bh.iter(|| { + let _ = from_utf8_lossy(s); + }); + } + + #[bench] + fn from_utf8_lossy_100_multibyte(bh: &mut BenchHarness) { + let s = bytes!("𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰"); + assert_eq!(100, s.len()); + bh.iter(|| { + let _ = from_utf8_lossy(s); + }); + } + + #[bench] + fn from_utf8_lossy_invalid(bh: &mut BenchHarness) { + let s = bytes!("Hello", 0xC0, 0x80, " There", 0xE6, 0x83, " Goodbye"); + bh.iter(|| { + let _ = from_utf8_lossy(s); + }); + } + + #[bench] + fn from_utf8_lossy_100_invalid(bh: &mut BenchHarness) { + let s = ::vec::from_elem(100, 0xF5u8); + bh.iter(|| { + let _ = from_utf8_lossy(s); + }); + } + #[bench] fn bench_with_capacity(bh: &mut BenchHarness) { bh.iter(|| {