diff --git a/src/uu/paste/src/paste.rs b/src/uu/paste/src/paste.rs index 3d4cf733ca..9d26197813 100644 --- a/src/uu/paste/src/paste.rs +++ b/src/uu/paste/src/paste.rs @@ -3,13 +3,14 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -// spell-checker:ignore (ToDO) delim - use clap::{crate_version, Arg, ArgAction, Command}; +use std::cell::{OnceCell, RefCell}; use std::fs::File; -use std::io::{stdin, stdout, BufRead, BufReader, Read, Write}; -use std::path::Path; -use uucore::error::{FromIo, UResult, USimpleError}; +use std::io::{stdin, stdout, BufRead, BufReader, Stdin, Write}; +use std::iter::Cycle; +use std::rc::Rc; +use std::slice::Iter; +use uucore::error::{UResult, USimpleError}; use uucore::line_ending::LineEnding; use uucore::{format_usage, help_about, help_usage}; @@ -23,18 +24,6 @@ mod options { pub const ZERO_TERMINATED: &str = "zero-terminated"; } -// Wraps BufReader and stdin -fn read_until( - reader: Option<&mut BufReader>, - byte: u8, - buf: &mut Vec, -) -> std::io::Result { - match reader { - Some(reader) => reader.read_until(byte, buf), - None => stdin().lock().read_until(byte, buf), - } -} - #[uucore::main] pub fn uumain(args: impl uucore::Args) -> UResult<()> { let matches = uu_app().try_get_matches_from(args)?; @@ -96,120 +85,292 @@ fn paste( delimiters: &str, line_ending: LineEnding, ) -> UResult<()> { - let mut files = Vec::with_capacity(filenames.len()); - for name in filenames { - let file = if name == "-" { - None - } else { - let path = Path::new(&name); - let r = File::open(path).map_err_context(String::new)?; - Some(BufReader::new(r)) + let unescaped_and_encoded_delimiters = parse_delimiters(delimiters)?; + + let stdin_once_cell = OnceCell::>>::new(); + + let mut input_source_vec = Vec::with_capacity(filenames.len()); + + for filename in filenames { + let input_source = match filename.as_str() { + "-" => InputSource::StandardInput( + stdin_once_cell + 
.get_or_init(|| Rc::new(RefCell::new(stdin()))) + .clone(), + ), + st => { + let file = File::open(st)?; + + InputSource::File(BufReader::new(file)) + } }; - files.push(file); - } - if delimiters.ends_with('\\') && !delimiters.ends_with("\\\\") { - return Err(USimpleError::new( - 1, - format!("delimiter list ends with an unescaped backslash: {delimiters}"), - )); + input_source_vec.push(input_source); } - let delimiters: Vec = unescape(delimiters).chars().collect(); - let mut delim_count = 0; - let mut delim_length = 1; - let stdout = stdout(); - let mut stdout = stdout.lock(); + let mut stdout = stdout().lock(); + + let line_ending_byte = u8::from(line_ending); + let line_ending_byte_array_ref = &[line_ending_byte]; + + let input_source_vec_len = input_source_vec.len(); + + let mut delimiter_state = DelimiterState::new(&unescaped_and_encoded_delimiters); let mut output = Vec::new(); + if serial { - for file in &mut files { + for input_source in &mut input_source_vec { output.clear(); + loop { - match read_until(file.as_mut(), line_ending as u8, &mut output) { - Ok(0) => break, - Ok(_) => { - if output.ends_with(&[line_ending as u8]) { - output.pop(); - } - // a buffer of length four is large enough to encode any char - let mut buffer = [0; 4]; - let ch = - delimiters[delim_count % delimiters.len()].encode_utf8(&mut buffer); - delim_length = ch.len(); - - for byte in buffer.iter().take(delim_length) { - output.push(*byte); - } + match input_source.read_until(line_ending_byte, &mut output)? 
{ + 0 => break, + _ => { + remove_trailing_line_ending_byte(line_ending_byte, &mut output); + + delimiter_state.write_delimiter(&mut output); } - Err(e) => return Err(e.map_err_context(String::new)), } - delim_count += 1; } - // remove final delimiter - output.truncate(output.len() - delim_length); - - write!( - stdout, - "{}{}", - String::from_utf8_lossy(&output), - line_ending - )?; + + delimiter_state.remove_trailing_delimiter(&mut output); + + stdout.write_all(&output)?; + stdout.write_all(line_ending_byte_array_ref)?; } } else { - let mut eof = vec![false; files.len()]; + let mut eof = vec![false; input_source_vec_len]; + loop { output.clear(); + let mut eof_count = 0; - for (i, file) in files.iter_mut().enumerate() { + + for (i, input_source) in input_source_vec.iter_mut().enumerate() { if eof[i] { eof_count += 1; } else { - match read_until(file.as_mut(), line_ending as u8, &mut output) { - Ok(0) => { + match input_source.read_until(line_ending_byte, &mut output)? { + 0 => { eof[i] = true; eof_count += 1; } - Ok(_) => { - if output.ends_with(&[line_ending as u8]) { - output.pop(); - } + _ => { + remove_trailing_line_ending_byte(line_ending_byte, &mut output); } - Err(e) => return Err(e.map_err_context(String::new)), } } - // a buffer of length four is large enough to encode any char - let mut buffer = [0; 4]; - let ch = delimiters[delim_count % delimiters.len()].encode_utf8(&mut buffer); - delim_length = ch.len(); - - for byte in buffer.iter().take(delim_length) { - output.push(*byte); - } - delim_count += 1; + delimiter_state.write_delimiter(&mut output); } - if files.len() == eof_count { + + if eof_count == input_source_vec_len { break; } - // Remove final delimiter - output.truncate(output.len() - delim_length); - - write!( - stdout, - "{}{}", - String::from_utf8_lossy(&output), - line_ending - )?; - delim_count = 0; + + delimiter_state.remove_trailing_delimiter(&mut output); + + stdout.write_all(&output)?; + stdout.write_all(line_ending_byte_array_ref)?; 
+ + // Quote: + // When the -s option is not specified: + // [...] + // The delimiter shall be reset to the first element of list after each file operand is processed. + // https://pubs.opengroup.org/onlinepubs/9799919799/utilities/paste.html + delimiter_state.reset_to_first_delimiter(); } } + Ok(()) } -// Unescape all special characters -fn unescape(s: &str) -> String { - s.replace("\\n", "\n") - .replace("\\t", "\t") - .replace("\\\\", "\\") +fn parse_delimiters(delimiters: &str) -> UResult]>> { + /// A single backslash char + const BACKSLASH: char = '\\'; + + fn add_one_byte_single_char_delimiter(vec: &mut Vec>, byte: u8) { + vec.push(Box::new([byte])); + } + + // a buffer of length four is large enough to encode any char + let mut buffer = [0; 4]; + + let mut add_single_char_delimiter = |vec: &mut Vec>, ch: char| { + let delimiter_encoded = ch.encode_utf8(&mut buffer); + + vec.push(Box::from(delimiter_encoded.as_bytes())); + }; + + let mut vec = Vec::>::with_capacity(delimiters.len()); + + let mut chars = delimiters.chars(); + + // Unescape all special characters + while let Some(char) = chars.next() { + match char { + BACKSLASH => match chars.next() { + // "Empty string (not a null character)" + // https://pubs.opengroup.org/onlinepubs/9799919799/utilities/paste.html + Some('0') => { + vec.push(Box::<[u8; 0]>::new([])); + } + // "\\" to "\" (U+005C) + Some(BACKSLASH) => { + add_one_byte_single_char_delimiter(&mut vec, b'\\'); + } + // "\n" to U+000A + Some('n') => { + add_one_byte_single_char_delimiter(&mut vec, b'\n'); + } + // "\t" to U+0009 + Some('t') => { + add_one_byte_single_char_delimiter(&mut vec, b'\t'); + } + Some(other_char) => { + // "If any other characters follow the , the results are unspecified." 
/// Drop the final byte of `output` when it is exactly `line_ending_byte`.
///
/// Any other buffer, including an empty one, is left untouched.
fn remove_trailing_line_ending_byte(line_ending_byte: u8, output: &mut Vec<u8>) {
    if output.last() == Some(&line_ending_byte) {
        output.pop();
    }
}

/// Keeps track of which delimiter has to be written next.
enum DelimiterState<'a> {
    /// Nothing is ever written between fields.
    NoDelimiters,
    /// The same delimiter is written every time.
    OneDelimiter(&'a [u8]),
    /// Delimiters are taken from the list in order, wrapping around at the end.
    MultipleDelimiters {
        /// The delimiter most recently appended by `write_delimiter`
        /// (the first list element until something has been written).
        current_delimiter: &'a [u8],
        /// The complete list, kept so that the cycle can be restarted.
        delimiters: &'a [Box<[u8]>],
        /// Endless iterator that yields the next delimiter to write.
        delimiters_iterator: Cycle<Iter<'a, Box<[u8]>>>,
    },
}

impl<'a> DelimiterState<'a> {
    /// Build the state for an already unescaped and UTF-8 encoded delimiter list.
    ///
    /// An empty list, or a list holding one empty delimiter, yields
    /// `NoDelimiters`; a single non-empty delimiter yields `OneDelimiter`;
    /// anything longer yields `MultipleDelimiters`.
    fn new(unescaped_and_encoded_delimiters: &'a [Box<[u8]>]) -> DelimiterState<'a> {
        match unescaped_and_encoded_delimiters {
            [] => DelimiterState::NoDelimiters,
            // -d '\0' is equivalent to -d ''
            [only_delimiter] if only_delimiter.is_empty() => DelimiterState::NoDelimiters,
            [only_delimiter] => DelimiterState::OneDelimiter(only_delimiter),
            [first_delimiter, ..] => DelimiterState::MultipleDelimiters {
                current_delimiter: first_delimiter,
                delimiters: unescaped_and_encoded_delimiters,
                delimiters_iterator: unescaped_and_encoded_delimiters.iter().cycle(),
            },
        }
    }

    /// Restart the cycle so that the next delimiter written is the first one in the list.
    ///
    /// This is a no-op unless there are multiple delimiters.
    fn reset_to_first_delimiter(&mut self) {
        match self {
            DelimiterState::MultipleDelimiters {
                delimiters_iterator,
                delimiters,
                ..
            } => {
                let all_delimiters: &'a [Box<[u8]>] = *delimiters;

                *delimiters_iterator = all_delimiters.iter().cycle();
            }
            DelimiterState::NoDelimiters | DelimiterState::OneDelimiter(_) => {}
        }
    }

    /// Remove the trailing delimiter from `output`.
    ///
    /// Bytes are removed only when `output` really ends with the delimiter that
    /// was written last. Nothing happens when there are no delimiters, when that
    /// delimiter is empty (a `\0` element), or when `output` does not end with it
    /// (for example because nothing was written into an empty buffer).
    fn remove_trailing_delimiter(&mut self, output: &mut Vec<u8>) {
        let last_written: &[u8] = match self {
            DelimiterState::NoDelimiters => return,
            DelimiterState::OneDelimiter(only_delimiter) => *only_delimiter,
            DelimiterState::MultipleDelimiters {
                current_delimiter, ..
            } => *current_delimiter,
        };

        // Checking the suffix (instead of blindly subtracting lengths) means a
        // too-short buffer can neither underflow nor lose unrelated bytes.
        if !last_written.is_empty() && output.ends_with(last_written) {
            output.truncate(output.len() - last_written.len());
        }
    }

    /// Append the next delimiter to `output` and remember it as the current one.
    ///
    /// If there are no delimiters, this is a no-op.
    fn write_delimiter(&mut self, output: &mut Vec<u8>) {
        match self {
            DelimiterState::NoDelimiters => {}
            DelimiterState::OneDelimiter(only_delimiter) => {
                output.extend_from_slice(only_delimiter);
            }
            DelimiterState::MultipleDelimiters {
                current_delimiter,
                delimiters_iterator,
                ..
            } => {
                let next_delimiter: &'a [u8] = delimiters_iterator
                    .next()
                    .expect("a cycle over a non-empty slice never ends");

                output.extend_from_slice(next_delimiter);

                *current_delimiter = next_delimiter;
            }
        }
    }
}
+ } = self + { + *delimiters_iterator = delimiters.iter().cycle(); + } + } + + /// Remove the trailing delimiter. + /// If there are no delimiters, this is a no-op. + fn remove_trailing_delimiter(&mut self, output: &mut Vec) { + let delimiter_length = match self { + DelimiterState::OneDelimiter(only_delimiter) => only_delimiter.len(), + DelimiterState::MultipleDelimiters { + current_delimiter, .. + } => current_delimiter.len(), + _ => { + return; + } + }; + + // `delimiter_length` will be zero if the current delimiter is a "\0" delimiter + if delimiter_length > 0 { + let output_len = output.len(); + + if let Some(output_without_delimiter_length) = output_len.checked_sub(delimiter_length) + { + output.truncate(output_without_delimiter_length); + } else { + // This branch is NOT unreachable, must be skipped + // `output` should be empty in this case + assert!(output_len == 0); + } + } + } + + /// Append the current delimiter to `output`. + /// If there are no delimiters, this is a no-op. + fn write_delimiter(&mut self, output: &mut Vec) { + match self { + DelimiterState::OneDelimiter(only_delimiter) => { + output.extend_from_slice(only_delimiter); + } + DelimiterState::MultipleDelimiters { + current_delimiter, + delimiters_iterator, + .. + } => { + // Unwrap because `delimiters_iterator` is a cycle iter and was created from a non-empty slice + let bo = delimiters_iterator.next().unwrap(); + + output.extend_from_slice(bo); + + *current_delimiter = bo; + } + _ => {} + } + } +} + +enum InputSource { + File(BufReader), + StandardInput(Rc>), +} + +impl InputSource { + fn read_until(&mut self, byte: u8, buf: &mut Vec) -> UResult { + let us = match self { + Self::File(bu) => bu.read_until(byte, buf)?, + Self::StandardInput(rc) => rc + .try_borrow() + .map_err(|bo| USimpleError::new(1, format!("{bo}")))? 
// With an empty delimiter list, serial mode joins the input lines with nothing between them.
#[test]
fn test_delimiter_list_empty() {
    const INPUT: &str = "A ALPHA 1 _\nB BRAVO 2 _\nC CHARLIE 3 _\n";
    const EXPECTED: &str = "A ALPHA 1 _B BRAVO 2 _C CHARLIE 3 _\n";

    for flag in ["-d", "--delimiters"] {
        new_ucmd!()
            .args(&[flag, "", "-s"])
            .pipe_in(INPUT)
            .succeeds()
            .stdout_only(EXPECTED);
    }
}
// Was panicking (usize subtraction that would have resulted in a negative number)
// Not observable in release builds, since integer overflow checking is not enabled
#[test]
fn test_delimiter_truncation() {
    const INPUT: &str = "FIRST\nSECOND\nTHIRD\nFOURTH\nABCDEFG\n";
    // The first "-" consumes all of standard input; the other two produce empty lines.
    const EXPECTED: &str = "FIRST!SECOND@THIRD#FOURTH!ABCDEFG\n\n\n";

    for flag in ["-d", "--delimiters"] {
        new_ucmd!()
            .args(&[flag, "!@#", "-s", "-", "-", "-"])
            .pipe_in(INPUT)
            .succeeds()
            .stdout_only(EXPECTED);
    }
}

// Input that is not valid UTF-8 must be passed through byte for byte.
#[test]
fn test_non_utf8_input() {
    // 0xC0 is not valid UTF-8
    const RAW_BYTES: &[u8] = b"Non-UTF-8 test: \xC0\x00\xC0.\n";

    new_ucmd!()
        .pipe_in(RAW_BYTES)
        .succeeds()
        .stdout_only_bytes(RAW_BYTES);
}

// Three backslashes: an escaped backslash followed by a dangling, unescaped one.
#[test]
fn test_three_trailing_backslashes_delimiter() {
    let delimiter_argument = r"\".repeat(3);

    let expected_suffix =
        format!(": delimiter list ends with an unescaped backslash: {delimiter_argument}\n");

    for flag in ["-d", "--delimiters"] {
        new_ucmd!()
            .args(&[flag, delimiter_argument.as_str()])
            .fails()
            .no_stdout()
            .stderr_str_check(|stderr| stderr.ends_with(&expected_suffix));
    }
}
// "If any other characters follow the <backslash>, the results are unspecified."
// https://pubs.opengroup.org/onlinepubs/9799919799/utilities/paste.html
// However, other implementations remove the backslash
#[test]
fn test_posix_unspecified_delimiter() {
    const INPUT: &str = "1\n2\n3\n4\n";
    const EXPECTED: &str = "1z2z3z4\n";

    for flag in ["-d", "--delimiters"] {
        new_ucmd!()
            .args(&[flag, r"\z", "-s"])
            .pipe_in(INPUT)
            .succeeds()
            .stdout_only(EXPECTED);
    }
}

// "Empty string (not a null character)"
// https://pubs.opengroup.org/onlinepubs/9799919799/utilities/paste.html
#[test]
fn test_backslash_zero_delimiter() {
    const INPUT: &str = "1\n2\n3\n4\n5\n6\n";
    const EXPECTED: &str = "12z345z6\n";

    for flag in ["-d", "--delimiters"] {
        new_ucmd!()
            .args(&[flag, r"\0z\0", "-s"])
            .pipe_in(INPUT)
            .succeeds()
            .stdout_only(EXPECTED);
    }
}

// As of 2024-10-09, only bsdutils (https://github.com/dcantrell/bsdutils, derived from FreeBSD) and toybox handle
// multibyte delimiter characters in the way a user would likely expect. BusyBox and GNU Core Utilities do not.
#[test]
fn test_multi_byte_delimiter() {
    const INPUT: &str = "1\n2\n3\n4\n5\n6\n";
    const EXPECTED: &str = "1!2ß3@4!5ß6\n";

    for flag in ["-d", "--delimiters"] {
        new_ucmd!()
            .args(&[flag, "!ß@", "-s"])
            .pipe_in(INPUT)
            .succeeds()
            .stdout_only(EXPECTED);
    }
}