From 0fbdeeffd3e9d4d811d119e19a9657c43d644f37 Mon Sep 17 00:00:00 2001 From: Isabel Atkinson Date: Wed, 11 Sep 2024 14:39:20 -0400 Subject: [PATCH] RUST-2023 Add wrapper type for utf-8 lossy deserialization (#497) --- src/de/raw.rs | 7 ++- src/document.rs | 1 + src/lib.rs | 5 +- src/serde_helpers.rs | 43 ++++++++++++++++ src/tests/modules/serializer_deserializer.rs | 1 + src/tests/serde_helpers.rs | 54 +++++++++++++++++++- src/tests/spec/corpus.rs | 4 ++ 7 files changed, 111 insertions(+), 4 deletions(-) diff --git a/src/de/raw.rs b/src/de/raw.rs index 6501c325..58a2d2bc 100644 --- a/src/de/raw.rs +++ b/src/de/raw.rs @@ -18,7 +18,7 @@ use crate::{ RAW_BSON_NEWTYPE, RAW_DOCUMENT_NEWTYPE, }, - serde_helpers::HUMAN_READABLE_NEWTYPE, + serde_helpers::{HUMAN_READABLE_NEWTYPE, UTF8_LOSSY_NEWTYPE}, spec::{BinarySubtype, ElementType}, uuid::UUID_NEWTYPE_NAME, DateTime, @@ -297,6 +297,11 @@ impl<'de> serde::de::Deserializer<'de> for Deserializer<'de> { inner.options.human_readable = true; visitor.visit_newtype_struct(inner) } + UTF8_LOSSY_NEWTYPE => { + let mut inner = self; + inner.options.utf8_lossy = true; + visitor.visit_newtype_struct(inner) + } _ => visitor.visit_newtype_struct(self), } } diff --git a/src/document.rs b/src/document.rs index ebfcc622..3a5f189f 100644 --- a/src/document.rs +++ b/src/document.rs @@ -599,6 +599,7 @@ impl Document { /// This is mainly useful when reading raw BSON returned from a MongoDB server, which /// in rare cases can contain invalidly truncated strings (). /// For most use cases, `Document::from_reader` can be used instead. + #[deprecated = "use bson::serde_helpers::Utf8LossyDeserialization"] pub fn from_reader_utf8_lossy(mut reader: R) -> crate::de::Result { Self::decode(&mut reader, true) } diff --git a/src/lib.rs b/src/lib.rs index e68cbd68..e7ff11b3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -295,9 +295,7 @@ pub use self::{ from_document, from_document_with_options, from_reader, - from_reader_utf8_lossy, from_slice, - from_slice_utf8_lossy, Deserializer, DeserializerOptions, }, @@ -328,6 +326,9 @@ pub use self::{ uuid::{Uuid, UuidRepresentation}, }; +#[allow(deprecated)] +pub use self::de::{from_reader_utf8_lossy, from_slice_utf8_lossy,}; + #[macro_use] mod macros; pub mod binary; diff --git a/src/serde_helpers.rs b/src/serde_helpers.rs index eb86578f..56702b8e 100644 --- a/src/serde_helpers.rs +++ b/src/serde_helpers.rs @@ -886,3 +886,46 @@ impl<'de, T: Deserialize<'de>> Deserialize<'de> for HumanReadable { deserializer.deserialize_newtype_struct(HUMAN_READABLE_NEWTYPE, V(PhantomData)) } } + +/// Wrapper type for deserializing BSON bytes with invalid UTF-8 sequences. +/// +/// Any invalid UTF-8 strings contained in the wrapped type will be replaced with the Unicode +/// replacement character. This wrapper type only has an effect when deserializing from BSON bytes. +/// +/// This wrapper type has no impact on serialization. Serializing a `Utf8LossyDeserialization` +/// will call the `serialize` method for the wrapped `T`. +#[derive(PartialEq, Eq, PartialOrd, Ord, Debug)] +pub struct Utf8LossyDeserialization(pub T); + +pub(crate) const UTF8_LOSSY_NEWTYPE: &str = "$__bson_private_utf8_lossy"; + +impl Serialize for Utf8LossyDeserialization { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + self.0.serialize(serializer) + } +} + +impl<'de, T: Deserialize<'de>> Deserialize<'de> for Utf8LossyDeserialization { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + struct V(PhantomData T>); + impl<'de, T: Deserialize<'de>> Visitor<'de> for V { + type Value = Utf8LossyDeserialization; + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + formatter.write_str("Utf8Lossy wrapper") + } + fn visit_newtype_struct(self, deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + T::deserialize(deserializer).map(Utf8LossyDeserialization) + } + } + deserializer.deserialize_newtype_struct(UTF8_LOSSY_NEWTYPE, V(PhantomData)) + } +} diff --git a/src/tests/modules/serializer_deserializer.rs b/src/tests/modules/serializer_deserializer.rs index 2f476841..1083e985 100644 --- a/src/tests/modules/serializer_deserializer.rs +++ b/src/tests/modules/serializer_deserializer.rs @@ -73,6 +73,7 @@ fn test_encode_decode_utf8_string_invalid() { doc.to_writer(&mut buf).unwrap(); let expected = doc! { "key": "��" }; + #[allow(deprecated)] let decoded = Document::from_reader_utf8_lossy(&mut Cursor::new(buf)).unwrap(); assert_eq!(decoded, expected); } diff --git a/src/tests/serde_helpers.rs b/src/tests/serde_helpers.rs index 2e6453c1..805aa93b 100644 --- a/src/tests/serde_helpers.rs +++ b/src/tests/serde_helpers.rs @@ -1,6 +1,11 @@ +use core::str; + use serde::{de::Visitor, Deserialize, Serialize}; -use crate::serde_helpers::HumanReadable; +use crate::{ + from_slice, + serde_helpers::{HumanReadable, Utf8LossyDeserialization}, +}; #[test] fn human_readable_wrapper() { @@ -135,3 +140,50 @@ fn human_readable_wrapper() { let raw_tripped: Data = crate::from_slice(&bytes).unwrap(); assert_eq!(&raw_tripped, &expected); } + +#[test] +#[allow(dead_code)] // suppress warning for unread fields +fn utf8_lossy_wrapper() { + let invalid_bytes = b"\x80\xae".to_vec(); + let invalid_string = unsafe { String::from_utf8_unchecked(invalid_bytes) }; + + let both_strings_invalid_bytes = + rawdoc! { "s1": invalid_string.clone(), "s2": invalid_string.clone() }.into_bytes(); + let first_string_invalid_bytes = + rawdoc! { "s1": invalid_string.clone(), "s2": ":)" }.into_bytes(); + + let expected_replacement = "��".to_string(); + + #[derive(Debug, Deserialize)] + struct NoUtf8Lossy { + s1: String, + s2: String, + } + + from_slice::(&both_strings_invalid_bytes).unwrap_err(); + + let s = from_slice::>(&both_strings_invalid_bytes) + .unwrap() + .0; + assert_eq!(s.s1, expected_replacement); + assert_eq!(s.s2, expected_replacement); + + #[derive(Debug, Deserialize)] + struct FirstStringUtf8Lossy { + s1: Utf8LossyDeserialization, + s2: String, + } + + let s = from_slice::(&first_string_invalid_bytes).unwrap(); + assert_eq!(s.s1.0, expected_replacement); + assert_eq!(&s.s2, ":)"); + + from_slice::(&both_strings_invalid_bytes).unwrap_err(); + + let s = + from_slice::>(&both_strings_invalid_bytes) + .unwrap() + .0; + assert_eq!(s.s1.0, expected_replacement); + assert_eq!(s.s2, expected_replacement); +} diff --git a/src/tests/spec/corpus.rs b/src/tests/spec/corpus.rs index 95e63988..6df2487e 100644 --- a/src/tests/spec/corpus.rs +++ b/src/tests/spec/corpus.rs @@ -7,6 +7,7 @@ use std::{ use crate::{ raw::{RawBsonRef, RawDocument}, + serde_helpers::Utf8LossyDeserialization, tests::LOCK, Bson, Document, @@ -549,12 +550,15 @@ fn run_test(test: TestFile) { crate::from_reader::<_, Document>(bson.as_slice()).expect_err(description.as_str()); if decode_error.description.contains("invalid UTF-8") { + #[allow(deprecated)] crate::from_reader_utf8_lossy::<_, Document>(bson.as_slice()).unwrap_or_else(|err| { panic!( "{}: utf8_lossy should not fail (failed with {:?})", description, err ) }); + crate::from_slice::>(bson.as_slice()) + .expect(&description); } }