-
Notifications
You must be signed in to change notification settings - Fork 12.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Allow writing of incomplete UTF-8 sequences to the Windows console via stdout/stderr #83342
Changes from all commits
27393d5
a941e68
60b149f
0202273
d114694
52713a4
fb1fa97
34cfe38
3103f5f
7cfbe54
dd3b79e
fbfde7e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,14 +8,25 @@ use crate::str; | |
use crate::sys::c; | ||
use crate::sys::cvt; | ||
use crate::sys::handle::Handle; | ||
use core::str::utf8_char_width; | ||
|
||
// Don't cache handles but get them fresh for every read/write. This allows us to track changes to | ||
// the value over time (such as if a process calls `SetStdHandle` while it's running). See #40490. | ||
pub struct Stdin { | ||
surrogate: u16, | ||
} | ||
pub struct Stdout; | ||
pub struct Stderr; | ||
pub struct Stdout { | ||
incomplete_utf8: IncompleteUtf8, | ||
} | ||
|
||
pub struct Stderr { | ||
incomplete_utf8: IncompleteUtf8, | ||
} | ||
|
||
// Holds the leading bytes of a UTF-8 sequence that a previous console `write` call received | ||
// only partially, so that a later call can supply the remaining bytes. | ||
struct IncompleteUtf8 { | ||
// Bytes buffered so far; only the first `len` entries are meaningful. | ||
bytes: [u8; 4], | ||
// Number of buffered bytes in `bytes`; 0 means no sequence is pending. | ||
len: u8, | ||
} | ||
|
||
// Apparently Windows doesn't handle large reads on stdin or writes to stdout/stderr well (see | ||
// #13304 for details). | ||
|
@@ -50,7 +61,15 @@ fn is_console(handle: c::HANDLE) -> bool { | |
unsafe { c::GetConsoleMode(handle, &mut mode) != 0 } | ||
} | ||
|
||
fn write(handle_id: c::DWORD, data: &[u8]) -> io::Result<usize> { | ||
fn write( | ||
handle_id: c::DWORD, | ||
data: &[u8], | ||
incomplete_utf8: &mut IncompleteUtf8, | ||
) -> io::Result<usize> { | ||
if data.is_empty() { | ||
return Ok(0); | ||
} | ||
|
||
let handle = get_handle(handle_id)?; | ||
if !is_console(handle) { | ||
let handle = Handle::new(handle); | ||
|
@@ -59,22 +78,73 @@ fn write(handle_id: c::DWORD, data: &[u8]) -> io::Result<usize> { | |
return ret; | ||
} | ||
|
||
// As the console is meant for presenting text, we assume bytes of `data` come from a string | ||
// and are encoded as UTF-8, which needs to be encoded as UTF-16. | ||
if incomplete_utf8.len > 0 { | ||
assert!( | ||
incomplete_utf8.len < 4, | ||
"Unexpected number of bytes for incomplete UTF-8 codepoint." | ||
); | ||
if data[0] >> 6 != 0b10 { | ||
// not a continuation byte - reject | ||
incomplete_utf8.len = 0; | ||
return Err(io::Error::new_const( | ||
io::ErrorKind::InvalidData, | ||
&"Windows stdio in console mode does not support writing non-UTF-8 byte sequences", | ||
)); | ||
} | ||
incomplete_utf8.bytes[incomplete_utf8.len as usize] = data[0]; | ||
incomplete_utf8.len += 1; | ||
let char_width = utf8_char_width(incomplete_utf8.bytes[0]); | ||
if (incomplete_utf8.len as usize) < char_width { | ||
// more bytes needed | ||
return Ok(1); | ||
} | ||
let s = str::from_utf8(&incomplete_utf8.bytes[0..incomplete_utf8.len as usize]); | ||
incomplete_utf8.len = 0; | ||
match s { | ||
Ok(s) => { | ||
assert_eq!(char_width, s.len()); | ||
let written = write_valid_utf8_to_console(handle, s)?; | ||
assert_eq!(written, s.len()); // guaranteed by write_valid_utf8_to_console() for single codepoint writes | ||
return Ok(1); | ||
} | ||
Err(_) => { | ||
return Err(io::Error::new_const( | ||
io::ErrorKind::InvalidData, | ||
&"Windows stdio in console mode does not support writing non-UTF-8 byte sequences", | ||
)); | ||
} | ||
} | ||
} | ||
|
||
// As the console is meant for presenting text, we assume bytes of `data` are encoded as UTF-8, | ||
// which needs to be encoded as UTF-16. | ||
// | ||
// If the data is not valid UTF-8 we write out as many bytes as are valid. | ||
// Only when there are no valid bytes (which will happen on the next call), return an error. | ||
// If the first byte is invalid, it is either the first byte of a multi-byte sequence for which | ||
// the provided byte slice is too short, or the first byte of an invalid multi-byte sequence. | ||
let len = cmp::min(data.len(), MAX_BUFFER_SIZE / 2); | ||
let utf8 = match str::from_utf8(&data[..len]) { | ||
Ok(s) => s, | ||
Err(ref e) if e.valid_up_to() == 0 => { | ||
return Err(io::Error::new_const( | ||
io::ErrorKind::InvalidData, | ||
&"Windows stdio in console mode does not support writing non-UTF-8 byte sequences", | ||
)); | ||
let first_byte_char_width = utf8_char_width(data[0]); | ||
if first_byte_char_width > 1 && data.len() < first_byte_char_width { | ||
incomplete_utf8.bytes[0] = data[0]; | ||
incomplete_utf8.len = 1; | ||
return Ok(1); | ||
} else { | ||
return Err(io::Error::new_const( | ||
io::ErrorKind::InvalidData, | ||
&"Windows stdio in console mode does not support writing non-UTF-8 byte sequences", | ||
)); | ||
} | ||
} | ||
Err(e) => str::from_utf8(&data[..e.valid_up_to()]).unwrap(), | ||
}; | ||
|
||
write_valid_utf8_to_console(handle, utf8) | ||
} | ||
|
||
fn write_valid_utf8_to_console(handle: c::HANDLE, utf8: &str) -> io::Result<usize> { | ||
let mut utf16 = [0u16; MAX_BUFFER_SIZE / 2]; | ||
let mut len_utf16 = 0; | ||
for (chr, dest) in utf8.encode_utf16().zip(utf16.iter_mut()) { | ||
|
@@ -254,15 +324,21 @@ fn utf16_to_utf8(utf16: &[u16], utf8: &mut [u8]) -> io::Result<usize> { | |
Ok(written) | ||
} | ||
|
||
impl IncompleteUtf8 { | ||
// Creates an empty buffer (`len == 0`, all bytes zeroed). It is `const` so that the | ||
// `const fn new()` constructors of `Stdout` and `Stderr` can call it. | ||
pub const fn new() -> IncompleteUtf8 { | ||
IncompleteUtf8 { bytes: [0; 4], len: 0 } | ||
} | ||
} | ||
|
||
impl Stdout { | ||
pub const fn new() -> Stdout { | ||
Stdout | ||
Stdout { incomplete_utf8: IncompleteUtf8::new() } | ||
} | ||
} | ||
|
||
impl io::Write for Stdout { | ||
fn write(&mut self, buf: &[u8]) -> io::Result<usize> { | ||
write(c::STD_OUTPUT_HANDLE, buf) | ||
write(c::STD_OUTPUT_HANDLE, buf, &mut self.incomplete_utf8) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. An incomplete write will leave data in buffer even after There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This works similar to how the console output works on Macos and Linux, only that there the terminal itself does the buffering. E.g. this correctly prints "hello worldö" on the terminal on Linux and Macos: use std::io::{self, Write};
fn main() -> io::Result<()> {
io::stdout().write_all(b"hello world\xc3")?;
io::stdout().flush()?;
io::stdout().write_all(b"\xb6")?;
Ok(())
} There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The difference is that once you flush there's no data buffered in the program anymore. In general, the semantics of buffering incomplete UTF-8 seem reasonable to me. I think we'll need to talk about how this interacts with flush, though; I'm hesitant to hold on to buffered data of any kind after a flush. Possible semantics:
I don't want to do (3); that seems wrong. We shouldn't hide errors. Either (1) or (2) seems valid, but which one is more useful behavior? I suspect we want (1). |
||
} | ||
|
||
fn flush(&mut self) -> io::Result<()> { | ||
|
@@ -272,13 +348,13 @@ impl io::Write for Stdout { | |
|
||
impl Stderr { | ||
pub const fn new() -> Stderr { | ||
Stderr | ||
Stderr { incomplete_utf8: IncompleteUtf8::new() } | ||
} | ||
} | ||
|
||
impl io::Write for Stderr { | ||
fn write(&mut self, buf: &[u8]) -> io::Result<usize> { | ||
write(c::STD_ERROR_HANDLE, buf) | ||
write(c::STD_ERROR_HANDLE, buf, &mut self.incomplete_utf8) | ||
} | ||
|
||
fn flush(&mut self) -> io::Result<()> { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Wouldn't this mean that an incomplete write followed by a complete write will cause the complete write to trigger an error, so that it is not output at all?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You mean trying to write invalid UTF-8 to the console? This returns an error right now as well on Windows.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Well, if I do an incomplete write followed by a complete write, currently the error is reported at the incomplete write; with the change, the error will be reported at the complete write, which might be confusing.