Skip to content

Commit

Permalink
RUST-2023 Add wrapper type for utf-8 lossy deserialization (#497)
Browse files Browse the repository at this point in the history
  • Loading branch information
isabelatkinson authored Sep 11, 2024
1 parent 8e0fb3b commit 0fbdeef
Show file tree
Hide file tree
Showing 7 changed files with 111 additions and 4 deletions.
7 changes: 6 additions & 1 deletion src/de/raw.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ use crate::{
RAW_BSON_NEWTYPE,
RAW_DOCUMENT_NEWTYPE,
},
serde_helpers::HUMAN_READABLE_NEWTYPE,
serde_helpers::{HUMAN_READABLE_NEWTYPE, UTF8_LOSSY_NEWTYPE},
spec::{BinarySubtype, ElementType},
uuid::UUID_NEWTYPE_NAME,
DateTime,
Expand Down Expand Up @@ -297,6 +297,11 @@ impl<'de> serde::de::Deserializer<'de> for Deserializer<'de> {
inner.options.human_readable = true;
visitor.visit_newtype_struct(inner)
}
UTF8_LOSSY_NEWTYPE => {
let mut inner = self;
inner.options.utf8_lossy = true;
visitor.visit_newtype_struct(inner)
}
_ => visitor.visit_newtype_struct(self),
}
}
Expand Down
1 change: 1 addition & 0 deletions src/document.rs
Original file line number Diff line number Diff line change
Expand Up @@ -599,6 +599,7 @@ impl Document {
/// This is mainly useful when reading raw BSON returned from a MongoDB server, which
/// in rare cases can contain invalidly truncated strings (<https://jira.mongodb.org/browse/SERVER-24007>).
/// For most use cases, `Document::from_reader` can be used instead.
#[deprecated = "use bson::serde_helpers::Utf8LossyDeserialization"]
pub fn from_reader_utf8_lossy<R: Read>(mut reader: R) -> crate::de::Result<Document> {
// NOTE(review): the `true` argument presumably switches `decode` into lossy UTF-8 mode;
// `Document::decode` is not visible in this hunk, so confirm against its definition.
Self::decode(&mut reader, true)
}
Expand Down
5 changes: 3 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -295,9 +295,7 @@ pub use self::{
from_document,
from_document_with_options,
from_reader,
from_reader_utf8_lossy,
from_slice,
from_slice_utf8_lossy,
Deserializer,
DeserializerOptions,
},
Expand Down Expand Up @@ -328,6 +326,9 @@ pub use self::{
uuid::{Uuid, UuidRepresentation},
};

// These lossy entry points are still re-exported from the crate root. They are presumably
// marked `#[deprecated]` in `de` (confirm there), so the re-export itself would trip the
// `deprecated` lint without this `allow`.
#[allow(deprecated)]
pub use self::de::{from_reader_utf8_lossy, from_slice_utf8_lossy,};

#[macro_use]
mod macros;
pub mod binary;
Expand Down
43 changes: 43 additions & 0 deletions src/serde_helpers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -886,3 +886,46 @@ impl<'de, T: Deserialize<'de>> Deserialize<'de> for HumanReadable<T> {
deserializer.deserialize_newtype_struct(HUMAN_READABLE_NEWTYPE, V(PhantomData))
}
}

/// Wrapper type for deserializing BSON bytes with invalid UTF-8 sequences.
///
/// Any invalid UTF-8 strings contained in the wrapped type will be replaced with the Unicode
/// replacement character. This wrapper type only has an effect when deserializing from BSON bytes.
///
/// This wrapper type has no impact on serialization. Serializing a `Utf8LossyDeserialization<T>`
/// will call the `serialize` method for the wrapped `T`.
///
/// The wrapper is `Clone` whenever `T` is, so a struct that uses it for one of its fields can
/// still derive `Clone` itself.
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug)]
pub struct Utf8LossyDeserialization<T>(pub T);

/// Newtype-struct name that the `Deserialize` impl of `Utf8LossyDeserialization` passes to
/// `deserialize_newtype_struct`. The raw BSON deserializer matches on this name and sets its
/// `utf8_lossy` option before visiting the wrapped value.
pub(crate) const UTF8_LOSSY_NEWTYPE: &str = "$__bson_private_utf8_lossy";

// Serialization is transparent: the wrapper adds nothing to the output and simply forwards
// to the wrapped value's own `serialize`.
impl<T> Serialize for Utf8LossyDeserialization<T>
where
    T: Serialize,
{
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        let Self(wrapped) = self;
        wrapped.serialize(serializer)
    }
}

impl<'de, T: Deserialize<'de>> Deserialize<'de> for Utf8LossyDeserialization<T> {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
struct V<T>(PhantomData<fn() -> T>);
impl<'de, T: Deserialize<'de>> Visitor<'de> for V<T> {
type Value = Utf8LossyDeserialization<T>;
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
formatter.write_str("Utf8Lossy wrapper")
}
fn visit_newtype_struct<D>(self, deserializer: D) -> Result<Self::Value, D::Error>
where
D: serde::Deserializer<'de>,
{
T::deserialize(deserializer).map(Utf8LossyDeserialization)
}
}
deserializer.deserialize_newtype_struct(UTF8_LOSSY_NEWTYPE, V(PhantomData))
}
}
1 change: 1 addition & 0 deletions src/tests/modules/serializer_deserializer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ fn test_encode_decode_utf8_string_invalid() {
doc.to_writer(&mut buf).unwrap();

let expected = doc! { "key": "��" };
#[allow(deprecated)]
let decoded = Document::from_reader_utf8_lossy(&mut Cursor::new(buf)).unwrap();
assert_eq!(decoded, expected);
}
Expand Down
54 changes: 53 additions & 1 deletion src/tests/serde_helpers.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
use core::str;

use serde::{de::Visitor, Deserialize, Serialize};

use crate::serde_helpers::HumanReadable;
use crate::{
from_slice,
serde_helpers::{HumanReadable, Utf8LossyDeserialization},
};

#[test]
fn human_readable_wrapper() {
Expand Down Expand Up @@ -135,3 +140,50 @@ fn human_readable_wrapper() {
let raw_tripped: Data = crate::from_slice(&bytes).unwrap();
assert_eq!(&raw_tripped, &expected);
}

/// Checks that `Utf8LossyDeserialization` replaces invalid UTF-8 with U+FFFD, both when it
/// wraps a whole struct and when it wraps a single field, and that unwrapped strings still
/// reject invalid UTF-8.
#[test]
#[allow(dead_code)] // suppress warning for unread fields
fn utf8_lossy_wrapper() {
    /// Hand-encodes a BSON document consisting only of string elements.
    ///
    /// The bytes are written directly so that the test never has to materialize a Rust
    /// `String` holding invalid UTF-8: doing that via `String::from_utf8_unchecked` violates
    /// that function's safety contract.
    fn string_document(elements: &[(&str, &[u8])]) -> Vec<u8> {
        let mut body = Vec::new();
        for (key, value) in elements {
            body.push(0x02); // element type: string
            body.extend_from_slice(key.as_bytes());
            body.push(0x00); // cstring terminator for the key
            // The length prefix counts the trailing NUL. Inputs are tiny test constants, so
            // the cast cannot truncate.
            body.extend_from_slice(&(value.len() as i32 + 1).to_le_bytes());
            body.extend_from_slice(value);
            body.push(0x00); // string terminator
        }

        // Total size = 4-byte length prefix + elements + 1-byte document terminator.
        let total_len = body.len() + 5;
        let mut bytes = Vec::with_capacity(total_len);
        bytes.extend_from_slice(&(total_len as i32).to_le_bytes());
        bytes.extend_from_slice(&body);
        bytes.push(0x00);
        bytes
    }

    // Two lone continuation bytes: not valid UTF-8 on their own.
    const INVALID: &[u8] = b"\x80\xae";
    const VALID: &[u8] = b":)";

    let both_strings_invalid_bytes = string_document(&[("s1", INVALID), ("s2", INVALID)]);
    let first_string_invalid_bytes = string_document(&[("s1", INVALID), ("s2", VALID)]);

    // One replacement character per invalid byte.
    let expected_replacement = "\u{FFFD}\u{FFFD}".to_string();

    #[derive(Debug, Deserialize)]
    struct NoUtf8Lossy {
        s1: String,
        s2: String,
    }

    // Without the wrapper, invalid UTF-8 is an error.
    from_slice::<NoUtf8Lossy>(&both_strings_invalid_bytes).unwrap_err();

    // Wrapping the whole struct makes every contained string lossy.
    let s = from_slice::<Utf8LossyDeserialization<NoUtf8Lossy>>(&both_strings_invalid_bytes)
        .unwrap()
        .0;
    assert_eq!(s.s1, expected_replacement);
    assert_eq!(s.s2, expected_replacement);

    #[derive(Debug, Deserialize)]
    struct FirstStringUtf8Lossy {
        s1: Utf8LossyDeserialization<String>,
        s2: String,
    }

    // Wrapping a single field only affects that field.
    let s = from_slice::<FirstStringUtf8Lossy>(&first_string_invalid_bytes).unwrap();
    assert_eq!(s.s1.0, expected_replacement);
    assert_eq!(&s.s2, ":)");

    // The unwrapped second field still rejects invalid UTF-8.
    from_slice::<FirstStringUtf8Lossy>(&both_strings_invalid_bytes).unwrap_err();

    // An outer wrapper also covers the fields that are not individually wrapped.
    let s =
        from_slice::<Utf8LossyDeserialization<FirstStringUtf8Lossy>>(&both_strings_invalid_bytes)
            .unwrap()
            .0;
    assert_eq!(s.s1.0, expected_replacement);
    assert_eq!(s.s2, expected_replacement);
}
4 changes: 4 additions & 0 deletions src/tests/spec/corpus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use std::{

use crate::{
raw::{RawBsonRef, RawDocument},
serde_helpers::Utf8LossyDeserialization,
tests::LOCK,
Bson,
Document,
Expand Down Expand Up @@ -549,12 +550,15 @@ fn run_test(test: TestFile) {
crate::from_reader::<_, Document>(bson.as_slice()).expect_err(description.as_str());

if decode_error.description.contains("invalid UTF-8") {
#[allow(deprecated)]
crate::from_reader_utf8_lossy::<_, Document>(bson.as_slice()).unwrap_or_else(|err| {
panic!(
"{}: utf8_lossy should not fail (failed with {:?})",
description, err
)
});
crate::from_slice::<Utf8LossyDeserialization<Document>>(bson.as_slice())
.expect(&description);
}
}

Expand Down

0 comments on commit 0fbdeef

Please sign in to comment.