Skip to content

Commit

Permalink
Delay processing of incomplete UTF-8 sequences
Browse files Browse the repository at this point in the history
  • Loading branch information
bugadani authored and jessebraham committed Sep 12, 2023
1 parent 129439a commit 5b3dee0
Show file tree
Hide file tree
Showing 2 changed files with 110 additions and 2 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Fixed

- Fixed printing UTF-8 sequences that were read in multiple parts. (#468)

### Changed

### Removed
Expand Down
110 changes: 108 additions & 2 deletions espflash/src/cli/monitor/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ struct SerialContext<'ctx> {
symbols: Option<Symbols<'ctx>>,
previous_frag: Option<String>,
previous_line: Option<String>,
incomplete_utf8_buffer: Vec<u8>,
}

impl<'ctx> SerialContext<'ctx> {
Expand All @@ -52,6 +53,40 @@ impl<'ctx> SerialContext<'ctx> {
..Self::default()
}
}

fn process_utf8(&mut self, buff: &[u8]) -> String {
let mut buffer = std::mem::take(&mut self.incomplete_utf8_buffer);
buffer.extend(normalized(buff.iter().copied()));

// look for longest slice that we can then lossily convert without introducing errors for
// partial sequences (#457)
let mut len = 0;

loop {
match std::str::from_utf8(&buffer[len..]) {
// whole input is valid
Ok(str) if len == 0 => return String::from(str),

// input is valid after the last error, and we could ignore the last error, so
// let's process the whole input
Ok(_) => return String::from_utf8_lossy(&buffer).to_string(),

// input has some errors. We can ignore invalid sequences and replace them later,
// but we have to stop if we encounter an incomplete sequence.
Err(e) => {
len += e.valid_up_to();
if let Some(error_len) = e.error_len() {
len += error_len;
} else {
// incomplete sequence. We split it off, save it for later
let (bytes, incomplete) = buffer.split_at(len);
self.incomplete_utf8_buffer = incomplete.to_vec();
return String::from_utf8_lossy(bytes).to_string();
}
}
}
}
}
}

/// Type that ensures that raw mode is disabled when dropped.
Expand Down Expand Up @@ -144,8 +179,7 @@ pub fn monitor(

/// Handles and writes the received serial data to the given output stream.
fn handle_serial(ctx: &mut SerialContext, buff: &[u8], out: &mut dyn Write) {
let text: Vec<u8> = normalized(buff.iter().copied()).collect();
let text = String::from_utf8_lossy(&text).to_string();
let text = ctx.process_utf8(buff);

// Split the text into lines, storing the last of which separately if it is
// incomplete (ie. does not end with '\n') because these need special handling.
Expand Down Expand Up @@ -278,3 +312,75 @@ fn handle_key_event(key_event: KeyEvent) -> Option<Vec<u8>> {

key_str.map(|slice| slice.into())
}

#[cfg(test)]
mod test {
#[test]
fn returns_valid_strings_immediately() {
let mut ctx = super::SerialContext::default();
let buff = b"Hello, world!";
let text = ctx.process_utf8(buff);
assert_eq!(text, "Hello, world!");
}

#[test]
fn does_not_repeat_valid_strings() {
let mut ctx = super::SerialContext::default();
let text = ctx.process_utf8(b"Hello, world!");
assert_eq!(text, "Hello, world!");
let text = ctx.process_utf8(b"Something else");
assert_eq!(text, "Something else");
}

#[test]
fn replaces_invalid_sequence() {
let mut ctx = super::SerialContext::default();
let text = ctx.process_utf8(b"Hello, \xFF world!");
assert_eq!(text, "Hello, \u{FFFD} world!");
}

#[test]
fn can_replace_unfinished_incomplete_sequence() {
let mut ctx = super::SerialContext::default();
let mut incomplete = Vec::from("Hello, ".as_bytes());
let utf8 = "🙈".as_bytes();
incomplete.extend_from_slice(&utf8[..utf8.len() - 1]);
let text = ctx.process_utf8(&incomplete);
assert_eq!(text, "Hello, ");

let text = ctx.process_utf8(b" world!");
assert_eq!(text, "\u{FFFD} world!");
}

#[test]
fn can_merge_incomplete_sequence() {
let mut ctx = super::SerialContext::default();
let mut incomplete = Vec::from("Hello, ".as_bytes());
let utf8 = "🙈".as_bytes();
incomplete.extend_from_slice(&utf8[..utf8.len() - 1]);

let text = ctx.process_utf8(&incomplete);
assert_eq!(text, "Hello, ");

let text = ctx.process_utf8(&utf8[utf8.len() - 1..]);
assert_eq!(text, "🙈");
}

#[test]
fn issue_457() {
let mut ctx = super::SerialContext::default();
let mut result = String::new();

result.push_str(&ctx.process_utf8(&[0x48]));
result.push_str(&ctx.process_utf8(&[0x65, 0x6C, 0x6C]));
result.push_str(&ctx.process_utf8(&[
0x6F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21, 0x20, 0x77, 0x69, 0x74,
]));
result.push_str(&ctx.process_utf8(&[
0x68, 0x20, 0x55, 0x54, 0x46, 0x3A, 0x20, 0x77, 0x79, 0x73, 0x79,
]));
result.push_str(&ctx.process_utf8(&[0xC5, 0x82, 0x61, 0x6D, 0x0A]));

assert_eq!(result, "Hello world! with UTF: wysyłam\r\n");
}
}

0 comments on commit 5b3dee0

Please sign in to comment.