Skip to content

Commit

Permalink
bom: add option to disable bom sniffing
Browse files Browse the repository at this point in the history
This makes it possible to use the transcoder to pass the input bytes
through unconditionally, without any transcoding. This is equivalent to
not using the transcoder at all, but it simplifies the organization of
consumer code when the behavior is tied to a runtime configuration option.

This addresses part of the work toward completing
BurntSushi/ripgrep#1207
  • Loading branch information
LesnyRumcajs authored and BurntSushi committed Mar 3, 2019
1 parent ed15680 commit b840b09
Showing 1 changed file with 53 additions and 2 deletions.
55 changes: 53 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ pub struct DecodeReaderBytesBuilder {
utf8_passthru: bool,
bom_override: bool,
strip_bom: bool,
bom_sniffing: bool,
}

impl Default for DecodeReaderBytesBuilder {
Expand All @@ -127,6 +128,7 @@ impl DecodeReaderBytesBuilder {
utf8_passthru: false,
bom_override: false,
strip_bom: false,
bom_sniffing: true,
}
}

Expand Down Expand Up @@ -161,8 +163,12 @@ impl DecodeReaderBytesBuilder {
}
let encoding = self.encoding
.map(|enc| enc.new_decoder_with_bom_removal());
// No need to do BOM detection if we have an explicit encoding.
let has_detected = !self.bom_override && encoding.is_some();

// No need to do BOM detection if we opt out of it or have an explicit
// encoding.
let has_detected =
!self.bom_sniffing || (!self.bom_override && encoding.is_some());

let peeker =
if self.utf8_passthru && self.strip_bom {
// We only need to do this when utf8_passthru is enabled
Expand Down Expand Up @@ -312,6 +318,24 @@ impl DecodeReaderBytesBuilder {
self.bom_override = yes;
self
}

/// Toggle BOM sniffing.
///
/// With sniffing turned on and no explicit encoding configured, the
/// decoder inspects the start of the input for a BOM and uses it to pick
/// the source encoding.
///
/// With sniffing turned off and no explicit encoding configured, the
/// input is handled as raw bytes: everything read from the underlying
/// reader is handed back unchanged, and a leading BOM (if any) is kept.
///
/// An explicit encoding is still honored when sniffing is turned off; in
/// that case the input is transcoded using that encoding.
///
/// BOM sniffing is on by default.
pub fn bom_sniffing(&mut self, yes: bool) -> &mut DecodeReaderBytesBuilder {
    self.bom_sniffing = yes;
    self
}
}

/// An implementation of `io::Read` that transcodes to UTF-8 in a streaming
Expand Down Expand Up @@ -739,6 +763,33 @@ mod tests {
assert_eq!(got, b"abcdefgh");
}

// With BOM sniffing disabled and no explicit encoding, the input (a
// UTF-16LE BOM followed by 'a') must be passed through byte-for-byte,
// BOM included.
#[test]
fn trans_utf16_no_sniffing() {
    let input = vec![0xFF, 0xFE, 0x61, 0x00];

    let mut builder = DecodeReaderBytesBuilder::new();
    builder.bom_sniffing(false);
    let reader = builder.build(&*input);

    let output = reader.bytes().map(Result::unwrap).collect::<Vec<u8>>();
    assert_eq!(output, input);
}

// With BOM sniffing disabled but an explicit UTF-16LE encoding set, the
// input is still transcoded: the BOM is dropped and only "a" remains.
#[test]
fn trans_utf16_no_sniffing_encoding_override() {
    let input = vec![0xFF, 0xFE, 0x61, 0x00];

    let mut builder = DecodeReaderBytesBuilder::new();
    builder.bom_sniffing(false);
    builder.encoding(Some(encoding_rs::UTF_16LE));
    let reader = builder.build(&*input);

    let output = reader.bytes().map(Result::unwrap).collect::<Vec<u8>>();
    assert_eq!(output, b"a");
}

// Test transcoding with a minimal buffer using byte oriented APIs.
#[test]
fn trans_utf16_minimal_buffer_byte_api() {
Expand Down

0 comments on commit b840b09

Please sign in to comment.