From 846cf06272d8862a8e3165494ca3c71244572da1 Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Fri, 20 Sep 2024 14:34:18 -0500 Subject: [PATCH 01/17] basenc: perform faster, streaming encoding Improve the performance, both in memory and time, of the encoding performed by the basenc (except in --z85 mode), base32, and base64 programs. These programs now perform encoding in a buffered/streaming manner, so encoding is not constrained by the amount of available memory. --- src/uu/base32/src/base_common.rs | 376 ++++++++++++++++++++++-- src/uucore/src/lib/features/encoding.rs | 64 ++-- tests/by-util/test_base64.rs | 33 +++ tests/by-util/test_basenc.rs | 4 +- 4 files changed, 415 insertions(+), 62 deletions(-) diff --git a/src/uu/base32/src/base_common.rs b/src/uu/base32/src/base_common.rs index 897722dd36e..f838bac09c1 100644 --- a/src/uu/base32/src/base_common.rs +++ b/src/uu/base32/src/base_common.rs @@ -3,22 +3,31 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -use std::io::{stdout, Read, Write}; - -use uucore::display::Quotable; -use uucore::encoding::{wrap_print, Data, EncodeError, Format}; -use uucore::error::{FromIo, UResult, USimpleError, UUsageError}; -use uucore::format_usage; +// spell-checker:ignore HEXUPPER Lsbf Msbf +use clap::{crate_version, Arg, ArgAction, Command}; use std::fs::File; +use std::io::{stdout, Read, Write}; use std::io::{BufReader, Stdin}; use std::path::Path; +use uucore::display::Quotable; +use uucore::encoding::{ + for_fast_encode::{BASE32, BASE32HEX, BASE64, BASE64URL, HEXUPPER}, + wrap_print, Data, EncodeError, Format, +}; +use uucore::encoding::{BASE2LSBF, BASE2MSBF}; +use uucore::error::{FromIo, UResult, USimpleError, UUsageError}; +use uucore::format_usage; -use clap::{crate_version, Arg, ArgAction, Command}; +pub const BASE_CMD_PARSE_ERROR: i32 = 1_i32; -pub static BASE_CMD_PARSE_ERROR: i32 = 1; +/// Encoded output will be formatted in lines of this length (the last line can be shorter) +/// +/// Other implementations default to 76 +/// +/// This default is only used if no "-w"/"--wrap" argument is passed +const WRAP_DEFAULT: usize = 76_usize; -// Config. pub struct Config { pub decode: bool, pub ignore_garbage: bool, @@ -118,7 +127,7 @@ pub fn base_app(about: &'static str, usage: &str) -> Command { .short('w') .long(options::WRAP) .value_name("COLS") - .help("wrap encoded lines after COLS character (default 76, 0 to disable wrapping)") + .help(format!("wrap encoded lines after COLS character (default {WRAP_DEFAULT}, 0 to disable wrapping)")) .overrides_with(options::WRAP), ) // "multiple" arguments are used to check whether there is more than one @@ -147,17 +156,43 @@ pub fn get_input<'a>(config: &Config, stdin_ref: &'a Stdin) -> UResult( input: &mut R, format: Format, - line_wrap: Option, + wrap: Option, ignore_garbage: bool, decode: bool, ) -> UResult<()> { - let mut data = Data::new(input, format).ignore_garbage(ignore_garbage); - if let Some(wrap) = line_wrap { - data = data.line_wrap(wrap); - } + const ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE: usize = 1_024_usize; + + // These constants indicate that inputs with lengths divisible by these numbers will have no padding characters + // after encoding. + // For instance: + // "The quick brown" + // is 15 characters (divisible by 3), so it is encoded in Base64 without padding: + // "VGhlIHF1aWNrIGJyb3du" + // While: + // "The quick brown fox" + // is 19 characters, which is not divisible by 3, so its Base64 representation has padding: + // "VGhlIHF1aWNrIGJyb3duIGZveA==" + // The encoding logic in this function depends on these constants being correct, so do not modify + // them. Performance can be tuned by multiplying these numbers by a different multiple (see + // `ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE` above). + const BASE16_UN_PADDED_MULTIPLE: usize = 1_usize; + const BASE2_UN_PADDED_MULTIPLE: usize = 1_usize; + const BASE32_UN_PADDED_MULTIPLE: usize = 5_usize; + const BASE64_UN_PADDED_MULTIPLE: usize = 3_usize; + + const BASE16_ENCODE_IN_CHUNKS_OF_SIZE: usize = + BASE16_UN_PADDED_MULTIPLE * ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE; + const BASE2_ENCODE_IN_CHUNKS_OF_SIZE: usize = + BASE2_UN_PADDED_MULTIPLE * ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE; + const BASE32_ENCODE_IN_CHUNKS_OF_SIZE: usize = + BASE32_UN_PADDED_MULTIPLE * ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE; + const BASE64_ENCODE_IN_CHUNKS_OF_SIZE: usize = + BASE64_UN_PADDED_MULTIPLE * ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE; if decode { - match data.decode() { + let mut data = Data::new(input, format); + + match data.decode(ignore_garbage) { Ok(s) => { // Silent the warning as we want to the error message #[allow(clippy::question_mark)] @@ -170,16 +205,307 @@ pub fn handle_input( Err(_) => Err(USimpleError::new(1, "error: invalid input")), } } else { - match data.encode() { - Ok(s) => { - wrap_print(&data, &s); - Ok(()) + #[allow(clippy::identity_op)] + let encoding_and_encode_in_chunks_of_size = match format { + // Use naive approach for Z85, since the crate being used doesn't have the API needed + Format::Z85 => { + let mut data = Data::new(input, format); + + let result = match data.encode() { + Ok(st) => { + wrap_print(&st, wrap.unwrap_or(WRAP_DEFAULT))?; + + Ok(()) + } + Err(EncodeError::InvalidInput) => { + Err(USimpleError::new(1, "error: invalid input")) + } + Err(_) => Err(USimpleError::new( + 1, + "error: invalid input (length must be multiple of 4 characters)", + )), + }; + + return result; } - Err(EncodeError::InvalidInput) => Err(USimpleError::new(1, "error: invalid input")), - Err(_) => Err(USimpleError::new( - 1, - "error: invalid input (length must be multiple of 4 characters)", - )), + + // For these, use faster, new encoding logic + Format::Base16 => (&HEXUPPER, BASE16_ENCODE_IN_CHUNKS_OF_SIZE), + Format::Base2Lsbf => (&BASE2LSBF, BASE2_ENCODE_IN_CHUNKS_OF_SIZE), + Format::Base2Msbf => (&BASE2MSBF, BASE2_ENCODE_IN_CHUNKS_OF_SIZE), + Format::Base32 => (&BASE32, BASE32_ENCODE_IN_CHUNKS_OF_SIZE), + Format::Base32Hex => (&BASE32HEX, BASE32_ENCODE_IN_CHUNKS_OF_SIZE), + Format::Base64 => (&BASE64, BASE64_ENCODE_IN_CHUNKS_OF_SIZE), + Format::Base64Url => (&BASE64URL, BASE64_ENCODE_IN_CHUNKS_OF_SIZE), + }; + + fast_encode::fast_encode(input, encoding_and_encode_in_chunks_of_size, wrap)?; + + Ok(()) + } +} + +mod fast_encode { + use crate::base_common::WRAP_DEFAULT; + use std::{ + collections::VecDeque, + io::{self, ErrorKind, Read, StdoutLock, Write}, + }; + use uucore::{ + encoding::for_fast_encode::Encoding, + error::{UResult, USimpleError}, + }; + + struct LineWrapping { + line_length: usize, + print_buffer: Vec, + } + + // Start of helper functions + // Adapted from `encode_append` in the "data-encoding" crate + fn encode_append_vec_deque(encoding: &Encoding, input: &[u8], output: &mut VecDeque) { + let output_len = output.len(); + + output.resize(output_len + encoding.encode_len(input.len()), 0_u8); + + let make_contiguous_result = output.make_contiguous(); + + encoding.encode_mut(input, &mut (make_contiguous_result[output_len..])); + } + + fn write_without_line_breaks( + encoded_buffer: &mut VecDeque, + stdout_lock: &mut StdoutLock, + is_cleanup: bool, + ) -> io::Result<()> { + // TODO + // `encoded_buffer` only has to be a VecDeque if line wrapping is enabled + // (`make_contiguous` should be a no-op here) + // Refactoring could avoid this call + stdout_lock.write_all(encoded_buffer.make_contiguous())?; + + if is_cleanup { + stdout_lock.write_all(b"\n")?; + } else { + encoded_buffer.truncate(0_usize); } + + Ok(()) + } + + fn write_with_line_breaks( + &mut LineWrapping { + ref line_length, + ref mut print_buffer, + }: &mut LineWrapping, + encoded_buffer: &mut VecDeque, + stdout_lock: &mut StdoutLock, + is_cleanup: bool, + ) -> io::Result<()> { + let line_length_usize = *line_length; + + assert!(line_length_usize > 0_usize); + + let number_of_lines = encoded_buffer.len() / line_length_usize; + + // How many bytes to take from the front of `encoded_buffer` and then write to stdout + let number_of_bytes_to_drain = number_of_lines * line_length_usize; + + let line_wrap_size_minus_one = line_length_usize - 1_usize; + + let mut i = 0_usize; + + for ue in encoded_buffer.drain(0_usize..number_of_bytes_to_drain) { + print_buffer.push(ue); + + if i == line_wrap_size_minus_one { + print_buffer.push(b'\n'); + + i = 0_usize; + } else { + i += 1_usize; + } + } + + stdout_lock.write_all(print_buffer)?; + + if is_cleanup { + if encoded_buffer.is_empty() { + // Do not write a newline in this case, because two trailing newlines should never be printed + } else { + // Print the partial line, since this is cleanup and no more data is coming + stdout_lock.write_all(encoded_buffer.make_contiguous())?; + stdout_lock.write_all(b"\n")?; + } + } else { + print_buffer.truncate(0_usize); + } + + Ok(()) + } + + fn write_to_stdout( + line_wrapping_option: &mut Option, + encoded_buffer: &mut VecDeque, + stdout_lock: &mut StdoutLock, + is_cleanup: bool, + ) -> io::Result<()> { + // Write all data in `encoded_buffer` to stdout + if let &mut Some(ref mut li) = line_wrapping_option { + write_with_line_breaks(li, encoded_buffer, stdout_lock, is_cleanup)?; + } else { + write_without_line_breaks(encoded_buffer, stdout_lock, is_cleanup)?; + } + + Ok(()) + } + // End of helper functions + + // TODO + // It turns out the crate being used already supports line wrapping: + // https://docs.rs/data-encoding/latest/data_encoding/struct.Specification.html#wrap-output-when-encoding-1 + // Check if that crate's line wrapping is faster than the wrapping being performed in this function + // `encoding` and `encode_in_chunks_of_size` are passed in a tuple to indicate that they are logically tied + pub fn fast_encode( + input: &mut R, + (encoding, encode_in_chunks_of_size): (&Encoding, usize), + line_wrap: Option, + ) -> UResult<()> { + /// Rust uses 8 kibibytes + /// + /// https://github.com/rust-lang/rust/blob/1a5a2240bc1b8cf0bcce7acb946c78d6493a4fd3/library/std/src/sys_common/io.rs#L3 + const INPUT_BUFFER_SIZE: usize = 8_usize * 1_024_usize; + + let mut line_wrapping_option = match line_wrap { + // Line wrapping is disabled because "-w"/"--wrap" was passed with "0" + Some(0_usize) => None, + // A custom line wrapping value was passed + Some(an) => Some(LineWrapping { + line_length: an, + print_buffer: Vec::::new(), + }), + // Line wrapping was not set, so the default is used + None => Some(LineWrapping { + line_length: WRAP_DEFAULT, + print_buffer: Vec::::new(), + }), + }; + + // Start of buffers + // Data that was read from stdin + let mut input_buffer = vec![0_u8; INPUT_BUFFER_SIZE]; + + assert!(!input_buffer.is_empty()); + + // Data that was read from stdin but has not been encoded yet + let mut leftover_buffer = VecDeque::::new(); + + // Encoded data that needs to be written to stdout + let mut encoded_buffer = VecDeque::::new(); + // End of buffers + + let mut stdout_lock = io::stdout().lock(); + + loop { + match input.read(&mut input_buffer) { + Ok(bytes_read_from_input) => { + if bytes_read_from_input == 0_usize { + break; + } + + // The part of `input_buffer` that was actually filled by the call to `read` + let read_buffer = &input_buffer[0_usize..bytes_read_from_input]; + + // How many bytes to steal from `read_buffer` to get `leftover_buffer` to the right size + let bytes_to_steal = encode_in_chunks_of_size - leftover_buffer.len(); + + if bytes_to_steal > bytes_read_from_input { + // Do not have enough data to encode a chunk, so copy data to `leftover_buffer` and read more + leftover_buffer.extend(read_buffer); + + continue; + } + + // Encode data in chunks, then place it in `encoded_buffer` + { + let bytes_to_chunk = if bytes_to_steal > 0 { + let (stolen_bytes, rest_of_read_buffer) = + read_buffer.split_at(bytes_to_steal); + + leftover_buffer.extend(stolen_bytes); + + // After appending the stolen bytes to `leftover_buffer`, it should be the right size + assert!(leftover_buffer.len() == encode_in_chunks_of_size); + + // Encode the old unencoded data and the stolen bytes, and add the result to + // `encoded_buffer` + encode_append_vec_deque( + encoding, + leftover_buffer.make_contiguous(), + &mut encoded_buffer, + ); + + // Reset `leftover_buffer` + leftover_buffer.truncate(0_usize); + + rest_of_read_buffer + } else { + // Do not need to steal bytes from `read_buffer` + read_buffer + }; + + let chunks_exact = bytes_to_chunk.chunks_exact(encode_in_chunks_of_size); + + let remainder = chunks_exact.remainder(); + + for sl in chunks_exact { + assert!(sl.len() == encode_in_chunks_of_size); + + encode_append_vec_deque(encoding, sl, &mut encoded_buffer); + } + + leftover_buffer.extend(remainder); + } + + // Write all data in `encoded_buffer` to stdout + write_to_stdout( + &mut line_wrapping_option, + &mut encoded_buffer, + &mut stdout_lock, + false, + )?; + } + Err(er) => { + if er.kind() == ErrorKind::Interrupted { + // TODO + // Retry reading? + } + + return Err(USimpleError::new(1_i32, format!("read error: {er}"))); + } + } + } + + // Cleanup + // `input` has finished producing data, so the data remaining in the buffers needs to be encoded and printed + { + // Encode all remaining unencoded bytes, placing it in `encoded_buffer` + encode_append_vec_deque( + encoding, + leftover_buffer.make_contiguous(), + &mut encoded_buffer, + ); + + // Write all data in `encoded_buffer` to stdout + // `is_cleanup` triggers special cleanup-only logic + write_to_stdout( + &mut line_wrapping_option, + &mut encoded_buffer, + &mut stdout_lock, + true, + )?; + } + + Ok(()) } } diff --git a/src/uucore/src/lib/features/encoding.rs b/src/uucore/src/lib/features/encoding.rs index 6a8e5ba221c..4beee6032bb 100644 --- a/src/uucore/src/lib/features/encoding.rs +++ b/src/uucore/src/lib/features/encoding.rs @@ -6,13 +6,19 @@ // spell-checker:ignore (strings) ABCDEFGHIJKLMNOPQRSTUVWXYZ ABCDEFGHIJKLMNOPQRSTUV // spell-checker:ignore (encodings) lsbf msbf hexupper -use std::io::{self, Read, Write}; - +use self::Format::*; use data_encoding::{Encoding, BASE32, BASE32HEX, BASE64, BASE64URL, HEXUPPER}; use data_encoding_macro::new_encoding; +use std::io::{self, Read, Write}; + #[cfg(feature = "thiserror")] use thiserror::Error; +// Re-export for the faster encoding logic +pub mod for_fast_encode { + pub use data_encoding::*; +} + #[derive(Debug, Error)] pub enum DecodeError { #[error("{}", _0)] @@ -42,13 +48,12 @@ pub enum Format { Base2Msbf, Z85, } -use self::Format::*; -const BASE2LSBF: Encoding = new_encoding! { +pub const BASE2LSBF: Encoding = new_encoding! { symbols: "01", bit_order: LeastSignificantFirst, }; -const BASE2MSBF: Encoding = new_encoding! { +pub const BASE2MSBF: Encoding = new_encoding! { symbols: "01", bit_order: MostSignificantFirst, }; @@ -96,8 +101,6 @@ pub fn decode(f: Format, input: &[u8]) -> DecodeResult { } pub struct Data { - line_wrap: usize, - ignore_garbage: bool, input: R, format: Format, alphabet: &'static [u8], @@ -106,8 +109,6 @@ pub struct Data { impl Data { pub fn new(input: R, format: Format) -> Self { Self { - line_wrap: 76, - ignore_garbage: false, input, format, alphabet: match format { @@ -123,22 +124,10 @@ impl Data { } } - #[must_use] - pub fn line_wrap(mut self, wrap: usize) -> Self { - self.line_wrap = wrap; - self - } - - #[must_use] - pub fn ignore_garbage(mut self, ignore: bool) -> Self { - self.ignore_garbage = ignore; - self - } - - pub fn decode(&mut self) -> DecodeResult { + pub fn decode(&mut self, ignore_garbage: bool) -> DecodeResult { let mut buf = vec![]; self.input.read_to_end(&mut buf)?; - if self.ignore_garbage { + if ignore_garbage { buf.retain(|c| self.alphabet.contains(c)); } else { buf.retain(|&c| c != b'\r' && c != b'\n'); @@ -155,24 +144,27 @@ impl Data { } } -// NOTE: this will likely be phased out at some point -pub fn wrap_print(data: &Data, res: &str) { +pub fn wrap_print(res: &str, line_wrap: usize) -> io::Result<()> { let stdout = io::stdout(); - wrap_write(stdout.lock(), data.line_wrap, res).unwrap(); -} -pub fn wrap_write(mut writer: W, line_wrap: usize, res: &str) -> io::Result<()> { - use std::cmp::min; + let mut stdout_lock = stdout.lock(); if line_wrap == 0 { - return write!(writer, "{res}"); - } + stdout_lock.write_all(res.as_bytes())?; + } else { + let res_len = res.len(); + + let mut start = 0; - let mut start = 0; - while start < res.len() { - let end = min(start + line_wrap, res.len()); - writeln!(writer, "{}", &res[start..end])?; - start = end; + while start < res_len { + let start_plus_line_wrap = start + line_wrap; + + let end = start_plus_line_wrap.min(res_len); + + writeln!(stdout_lock, "{}", &res[start..end])?; + + start = end; + } } Ok(()) diff --git a/tests/by-util/test_base64.rs b/tests/by-util/test_base64.rs index 403fd7db86a..b75ab0fba8a 100644 --- a/tests/by-util/test_base64.rs +++ b/tests/by-util/test_base64.rs @@ -146,3 +146,36 @@ fn test_base64_file_not_found() { .fails() .stderr_only("base64: a.txt: No such file or directory\n"); } + +#[test] +fn test_no_repeated_trailing_newline() { + new_ucmd!() + .args(&["--wrap", "10", "--", "-"]) + .pipe_in("The quick brown fox jumps over the lazy dog.") + .succeeds() + .stdout_only( + "\ +VGhlIHF1aW +NrIGJyb3du +IGZveCBqdW +1wcyBvdmVy +IHRoZSBsYX +p5IGRvZy4= +", + ); +} + +#[test] +fn test_wrap_default() { + new_ucmd!() + .args(&["--", "-"]) + .pipe_in("The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog.") + .succeeds() + .stdout_only( + "\ +VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wcyBvdmVyIHRoZSBsYXp5IGRvZy4gVGhlIHF1aWNrIGJy +b3duIGZveCBqdW1wcyBvdmVyIHRoZSBsYXp5IGRvZy4gVGhlIHF1aWNrIGJyb3duIGZveCBqdW1w +cyBvdmVyIHRoZSBsYXp5IGRvZy4= +", + ); +} diff --git a/tests/by-util/test_basenc.rs b/tests/by-util/test_basenc.rs index 18f0502a1da..fa49bd9714f 100644 --- a/tests/by-util/test_basenc.rs +++ b/tests/by-util/test_basenc.rs @@ -26,7 +26,9 @@ fn test_invalid_input() { let error_message = if cfg!(windows) { "basenc: .: Permission denied\n" } else { - "basenc: error: invalid input\n" + // TODO + // Other implementations do not show " (os error 21)" + "basenc: read error: Is a directory (os error 21)\n" }; new_ucmd!() .args(&["--base32", "."]) From 167586f6fbf39f2eea484ba4ea8fc2e5109c3da9 Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Fri, 20 Sep 2024 15:40:22 -0500 Subject: [PATCH 02/17] Fix compilation error and spelling warning --- src/uu/base32/src/base_common.rs | 24 +++++++++++++----------- tests/by-util/test_base64.rs | 2 ++ 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/uu/base32/src/base_common.rs b/src/uu/base32/src/base_common.rs index f838bac09c1..c3066e89282 100644 --- a/src/uu/base32/src/base_common.rs +++ b/src/uu/base32/src/base_common.rs @@ -230,13 +230,13 @@ pub fn handle_input( } // For these, use faster, new encoding logic - Format::Base16 => (&HEXUPPER, BASE16_ENCODE_IN_CHUNKS_OF_SIZE), - Format::Base2Lsbf => (&BASE2LSBF, BASE2_ENCODE_IN_CHUNKS_OF_SIZE), - Format::Base2Msbf => (&BASE2MSBF, BASE2_ENCODE_IN_CHUNKS_OF_SIZE), - Format::Base32 => (&BASE32, BASE32_ENCODE_IN_CHUNKS_OF_SIZE), - Format::Base32Hex => (&BASE32HEX, BASE32_ENCODE_IN_CHUNKS_OF_SIZE), - Format::Base64 => (&BASE64, BASE64_ENCODE_IN_CHUNKS_OF_SIZE), - Format::Base64Url => (&BASE64URL, BASE64_ENCODE_IN_CHUNKS_OF_SIZE), + Format::Base16 => (HEXUPPER, BASE16_ENCODE_IN_CHUNKS_OF_SIZE), + Format::Base2Lsbf => (BASE2LSBF, BASE2_ENCODE_IN_CHUNKS_OF_SIZE), + Format::Base2Msbf => (BASE2MSBF, BASE2_ENCODE_IN_CHUNKS_OF_SIZE), + Format::Base32 => (BASE32, BASE32_ENCODE_IN_CHUNKS_OF_SIZE), + Format::Base32Hex => (BASE32HEX, BASE32_ENCODE_IN_CHUNKS_OF_SIZE), + Format::Base64 => (BASE64, BASE64_ENCODE_IN_CHUNKS_OF_SIZE), + Format::Base64Url => (BASE64URL, BASE64_ENCODE_IN_CHUNKS_OF_SIZE), }; fast_encode::fast_encode(input, encoding_and_encode_in_chunks_of_size, wrap)?; @@ -365,10 +365,12 @@ mod fast_encode { // It turns out the crate being used already supports line wrapping: // https://docs.rs/data-encoding/latest/data_encoding/struct.Specification.html#wrap-output-when-encoding-1 // Check if that crate's line wrapping is faster than the wrapping being performed in this function + // Update: That crate does not support arbitrary width line wrapping. It only supports certain widths: + // https://github.com/ia0/data-encoding/blob/4f42ad7ef242f6d243e4de90cd1b46a57690d00e/lib/src/lib.rs#L1710 // `encoding` and `encode_in_chunks_of_size` are passed in a tuple to indicate that they are logically tied pub fn fast_encode( input: &mut R, - (encoding, encode_in_chunks_of_size): (&Encoding, usize), + (encoding, encode_in_chunks_of_size): (Encoding, usize), line_wrap: Option, ) -> UResult<()> { /// Rust uses 8 kibibytes @@ -440,7 +442,7 @@ mod fast_encode { // Encode the old unencoded data and the stolen bytes, and add the result to // `encoded_buffer` encode_append_vec_deque( - encoding, + &encoding, leftover_buffer.make_contiguous(), &mut encoded_buffer, ); @@ -461,7 +463,7 @@ mod fast_encode { for sl in chunks_exact { assert!(sl.len() == encode_in_chunks_of_size); - encode_append_vec_deque(encoding, sl, &mut encoded_buffer); + encode_append_vec_deque(&encoding, sl, &mut encoded_buffer); } leftover_buffer.extend(remainder); @@ -491,7 +493,7 @@ mod fast_encode { { // Encode all remaining unencoded bytes, placing it in `encoded_buffer` encode_append_vec_deque( - encoding, + &encoding, leftover_buffer.make_contiguous(), &mut encoded_buffer, ); diff --git a/tests/by-util/test_base64.rs b/tests/by-util/test_base64.rs index b75ab0fba8a..a35854ee888 100644 --- a/tests/by-util/test_base64.rs +++ b/tests/by-util/test_base64.rs @@ -4,6 +4,8 @@ // file that was distributed with this source code. use crate::common::util::TestScenario; +// spell-checker:ignore Bvdm + #[test] fn test_encode() { let input = "hello, world!"; From eaf74cd1f622da471761431a7f37a2e3fd362eaa Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Fri, 20 Sep 2024 16:23:13 -0500 Subject: [PATCH 03/17] Fix spelling warning --- tests/by-util/test_base64.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/by-util/test_base64.rs b/tests/by-util/test_base64.rs index a35854ee888..b0a89b4c2ae 100644 --- a/tests/by-util/test_base64.rs +++ b/tests/by-util/test_base64.rs @@ -4,8 +4,6 @@ // file that was distributed with this source code. use crate::common::util::TestScenario; -// spell-checker:ignore Bvdm - #[test] fn test_encode() { let input = "hello, world!"; @@ -156,6 +154,7 @@ fn test_no_repeated_trailing_newline() { .pipe_in("The quick brown fox jumps over the lazy dog.") .succeeds() .stdout_only( + // cSpell:disable "\ VGhlIHF1aW NrIGJyb3du @@ -164,20 +163,25 @@ IGZveCBqdW IHRoZSBsYX p5IGRvZy4= ", + // cSpell:enable ); } #[test] fn test_wrap_default() { + const PIPE_IN: &str = "The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog."; + new_ucmd!() .args(&["--", "-"]) - .pipe_in("The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog.") + .pipe_in(PIPE_IN) .succeeds() .stdout_only( + // cSpell:disable "\ VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wcyBvdmVyIHRoZSBsYXp5IGRvZy4gVGhlIHF1aWNrIGJy b3duIGZveCBqdW1wcyBvdmVyIHRoZSBsYXp5IGRvZy4gVGhlIHF1aWNrIGJyb3duIGZveCBqdW1w cyBvdmVyIHRoZSBsYXp5IGRvZy4= ", + // cSpell:enable ); } From 4d71c10279f535aea4f91083a0b8d3906af5042a Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Fri, 20 Sep 2024 17:09:47 -0500 Subject: [PATCH 04/17] Replace truncate with clear --- src/uu/base32/src/base_common.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/uu/base32/src/base_common.rs b/src/uu/base32/src/base_common.rs index c3066e89282..81a8401f027 100644 --- a/src/uu/base32/src/base_common.rs +++ b/src/uu/base32/src/base_common.rs @@ -287,7 +287,7 @@ mod fast_encode { if is_cleanup { stdout_lock.write_all(b"\n")?; } else { - encoded_buffer.truncate(0_usize); + encoded_buffer.clear(); } Ok(()) @@ -338,7 +338,7 @@ mod fast_encode { stdout_lock.write_all(b"\n")?; } } else { - print_buffer.truncate(0_usize); + print_buffer.clear(); } Ok(()) @@ -448,7 +448,7 @@ mod fast_encode { ); // Reset `leftover_buffer` - leftover_buffer.truncate(0_usize); + leftover_buffer.clear(); rest_of_read_buffer } else { From ed04f14ba8c9a95d6c81816d2a44c4f84feacfa8 Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Sat, 21 Sep 2024 04:02:09 -0500 Subject: [PATCH 05/17] basenc: also perform faster, streaming decoding Same as previous changes, just applied to decoding --- src/uu/base32/src/base_common.rs | 364 +++++++++++++++++++++--- src/uu/cksum/src/cksum.rs | 3 +- src/uucore/src/lib/features/encoding.rs | 158 +++++----- 3 files changed, 397 insertions(+), 128 deletions(-) diff --git a/src/uu/base32/src/base_common.rs b/src/uu/base32/src/base_common.rs index 81a8401f027..90a3b15b97d 100644 --- a/src/uu/base32/src/base_common.rs +++ b/src/uu/base32/src/base_common.rs @@ -11,11 +11,11 @@ use std::io::{stdout, Read, Write}; use std::io::{BufReader, Stdin}; use std::path::Path; use uucore::display::Quotable; +use uucore::encoding::{decode_z_eight_five, encode_z_eight_five, BASE2LSBF, BASE2MSBF}; use uucore::encoding::{ for_fast_encode::{BASE32, BASE32HEX, BASE64, BASE64URL, HEXUPPER}, - wrap_print, Data, EncodeError, Format, + wrap_print, EncodeError, Format, }; -use uucore::encoding::{BASE2LSBF, BASE2MSBF}; use uucore::error::{FromIo, UResult, USimpleError, UUsageError}; use uucore::format_usage; @@ -160,7 +160,7 @@ pub fn handle_input( ignore_garbage: bool, decode: bool, ) -> UResult<()> { - const ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE: usize = 1_024_usize; + const DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE: usize = 1_024_usize; // These constants indicate that inputs with lengths divisible by these numbers will have no padding characters // after encoding. @@ -174,54 +174,110 @@ pub fn handle_input( // "VGhlIHF1aWNrIGJyb3duIGZveA==" // The encoding logic in this function depends on these constants being correct, so do not modify // them. Performance can be tuned by multiplying these numbers by a different multiple (see - // `ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE` above). + // `DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE` above). const BASE16_UN_PADDED_MULTIPLE: usize = 1_usize; const BASE2_UN_PADDED_MULTIPLE: usize = 1_usize; const BASE32_UN_PADDED_MULTIPLE: usize = 5_usize; const BASE64_UN_PADDED_MULTIPLE: usize = 3_usize; const BASE16_ENCODE_IN_CHUNKS_OF_SIZE: usize = - BASE16_UN_PADDED_MULTIPLE * ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE; + BASE16_UN_PADDED_MULTIPLE * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE; const BASE2_ENCODE_IN_CHUNKS_OF_SIZE: usize = - BASE2_UN_PADDED_MULTIPLE * ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE; + BASE2_UN_PADDED_MULTIPLE * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE; const BASE32_ENCODE_IN_CHUNKS_OF_SIZE: usize = - BASE32_UN_PADDED_MULTIPLE * ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE; + BASE32_UN_PADDED_MULTIPLE * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE; const BASE64_ENCODE_IN_CHUNKS_OF_SIZE: usize = - BASE64_UN_PADDED_MULTIPLE * ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE; + BASE64_UN_PADDED_MULTIPLE * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE; + + const BASE16_VALID_DECODING_MULTIPLE: usize = 2_usize; + const BASE2_VALID_DECODING_MULTIPLE: usize = 8_usize; + const BASE32_VALID_DECODING_MULTIPLE: usize = 8_usize; + const BASE64_VALID_DECODING_MULTIPLE: usize = 4_usize; + + const BASE16_DECODE_IN_CHUNKS_OF_SIZE: usize = + BASE16_VALID_DECODING_MULTIPLE * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE; + const BASE2_DECODE_IN_CHUNKS_OF_SIZE: usize = + BASE2_VALID_DECODING_MULTIPLE * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE; + const BASE32_DECODE_IN_CHUNKS_OF_SIZE: usize = + BASE32_VALID_DECODING_MULTIPLE * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE; + const BASE64_DECODE_IN_CHUNKS_OF_SIZE: usize = + BASE64_VALID_DECODING_MULTIPLE * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE; if decode { - let mut data = Data::new(input, format); - - match data.decode(ignore_garbage) { - Ok(s) => { - // Silent the warning as we want to the error message - #[allow(clippy::question_mark)] - if stdout().write_all(&s).is_err() { - // on windows console, writing invalid utf8 returns an error - return Err(USimpleError::new(1, "error: cannot write non-utf8 data")); - } - Ok(()) + let encoding_and_decode_in_chunks_of_size_and_alphabet: (_, _, &[u8]) = match format { + // Use naive approach for Z85, since the crate being used doesn't have the API needed + Format::Z85 => { + let result = match decode_z_eight_five(input, ignore_garbage) { + Ok(ve) => { + if stdout().write_all(&ve).is_err() { + // on windows console, writing invalid utf8 returns an error + return Err(USimpleError::new( + 1_i32, + "error: cannot write non-utf8 data", + )); + } + + Ok(()) + } + Err(_) => Err(USimpleError::new(1_i32, "error: invalid input")), + }; + + return result; } - Err(_) => Err(USimpleError::new(1, "error: invalid input")), - } + + // For these, use faster, new decoding logic + Format::Base16 => ( + HEXUPPER, + BASE16_DECODE_IN_CHUNKS_OF_SIZE, + b"0123456789ABCDEF", + ), + Format::Base2Lsbf => (BASE2LSBF, BASE2_DECODE_IN_CHUNKS_OF_SIZE, b"01"), + Format::Base2Msbf => (BASE2MSBF, BASE2_DECODE_IN_CHUNKS_OF_SIZE, b"01"), + Format::Base32 => ( + BASE32, + BASE32_DECODE_IN_CHUNKS_OF_SIZE, + b"ABCDEFGHIJKLMNOPQRSTUVWXYZ234567=", + ), + Format::Base32Hex => ( + BASE32HEX, + BASE32_DECODE_IN_CHUNKS_OF_SIZE, + // spell-checker:disable-next-line + b"0123456789ABCDEFGHIJKLMNOPQRSTUV=", + ), + Format::Base64 => ( + BASE64, + BASE64_DECODE_IN_CHUNKS_OF_SIZE, + b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789=+/", + ), + Format::Base64Url => ( + BASE64URL, + BASE64_DECODE_IN_CHUNKS_OF_SIZE, + b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789=_-", + ), + }; + + fast_decode::fast_decode( + input, + encoding_and_decode_in_chunks_of_size_and_alphabet, + ignore_garbage, + )?; + + Ok(()) } else { - #[allow(clippy::identity_op)] let encoding_and_encode_in_chunks_of_size = match format { // Use naive approach for Z85, since the crate being used doesn't have the API needed Format::Z85 => { - let mut data = Data::new(input, format); - - let result = match data.encode() { + let result = match encode_z_eight_five(input) { Ok(st) => { wrap_print(&st, wrap.unwrap_or(WRAP_DEFAULT))?; Ok(()) } Err(EncodeError::InvalidInput) => { - Err(USimpleError::new(1, "error: invalid input")) + Err(USimpleError::new(1_i32, "error: invalid input")) } Err(_) => Err(USimpleError::new( - 1, + 1_i32, "error: invalid input (length must be multiple of 4 characters)", )), }; @@ -315,7 +371,7 @@ mod fast_encode { let mut i = 0_usize; - for ue in encoded_buffer.drain(0_usize..number_of_bytes_to_drain) { + for ue in encoded_buffer.drain(..number_of_bytes_to_drain) { print_buffer.push(ue); if i == line_wrap_size_minus_one { @@ -367,7 +423,8 @@ mod fast_encode { // Check if that crate's line wrapping is faster than the wrapping being performed in this function // Update: That crate does not support arbitrary width line wrapping. It only supports certain widths: // https://github.com/ia0/data-encoding/blob/4f42ad7ef242f6d243e4de90cd1b46a57690d00e/lib/src/lib.rs#L1710 - // `encoding` and `encode_in_chunks_of_size` are passed in a tuple to indicate that they are logically tied + // + /// `encoding` and `encode_in_chunks_of_size` are passed in a tuple to indicate that they are logically tied pub fn fast_encode( input: &mut R, (encoding, encode_in_chunks_of_size): (Encoding, usize), @@ -416,7 +473,7 @@ mod fast_encode { } // The part of `input_buffer` that was actually filled by the call to `read` - let read_buffer = &input_buffer[0_usize..bytes_read_from_input]; + let read_buffer = &input_buffer[..bytes_read_from_input]; // How many bytes to steal from `read_buffer` to get `leftover_buffer` to the right size let bytes_to_steal = encode_in_chunks_of_size - leftover_buffer.len(); @@ -430,7 +487,7 @@ mod fast_encode { // Encode data in chunks, then place it in `encoded_buffer` { - let bytes_to_chunk = if bytes_to_steal > 0 { + let bytes_to_chunk = if bytes_to_steal > 0_usize { let (stolen_bytes, rest_of_read_buffer) = read_buffer.split_at(bytes_to_steal); @@ -491,7 +548,7 @@ mod fast_encode { // Cleanup // `input` has finished producing data, so the data remaining in the buffers needs to be encoded and printed { - // Encode all remaining unencoded bytes, placing it in `encoded_buffer` + // Encode all remaining unencoded bytes, placing them in `encoded_buffer` encode_append_vec_deque( &encoding, leftover_buffer.make_contiguous(), @@ -511,3 +568,246 @@ mod fast_encode { Ok(()) } } + +mod fast_decode { + use std::io::{self, ErrorKind, Read, StdoutLock, Write}; + use uucore::{ + encoding::{alphabet_to_table, for_fast_encode::Encoding}, + error::{UResult, USimpleError}, + }; + + struct FilteringData { + table: [bool; 256_usize], + } + + // Start of helper functions + // Adapted from `decode` in the "data-encoding" crate + fn decode_into_vec(encoding: &Encoding, input: &[u8], output: &mut Vec) -> UResult<()> { + let decode_len_result = match encoding.decode_len(input.len()) { + Ok(us) => us, + Err(de) => { + return Err(USimpleError::new(1_i32, format!("{de}"))); + } + }; + + let output_len = output.len(); + + output.resize(output_len + decode_len_result, 0_u8); + + match encoding.decode_mut(input, &mut (output[output_len..])) { + Ok(us) => { + // See: + // https://docs.rs/data-encoding/latest/data_encoding/struct.Encoding.html#method.decode_mut + // "Returns the length of the decoded output. This length may be smaller than the output length if the input contained padding or ignored characters. The output bytes after the returned length are not initialized and should not be read." + output.truncate(output_len + us); + } + Err(_de) => { + return Err(USimpleError::new(1_i32, "error: invalid input".to_owned())); + } + } + + Ok(()) + } + + fn write_to_stdout( + decoded_buffer: &mut Vec, + stdout_lock: &mut StdoutLock, + ) -> io::Result<()> { + // Write all data in `decoded_buffer` to stdout + stdout_lock.write_all(decoded_buffer.as_slice())?; + + decoded_buffer.clear(); + + Ok(()) + } + // End of helper functions + + /// `encoding`, `decode_in_chunks_of_size`, and `alphabet` are passed in a tuple to indicate that they are + /// logically tied + pub fn fast_decode( + input: &mut R, + (encoding, decode_in_chunks_of_size, alphabet): (Encoding, usize, &[u8]), + ignore_garbage: bool, + ) -> UResult<()> { + /// Rust uses 8 kibibytes + /// + /// https://github.com/rust-lang/rust/blob/1a5a2240bc1b8cf0bcce7acb946c78d6493a4fd3/library/std/src/sys_common/io.rs#L3 + const INPUT_BUFFER_SIZE: usize = 8_usize * 1_024_usize; + + // Note that it's not worth using "data-encoding"'s ignore functionality if "ignore_garbage" is true, because + // "data-encoding"'s ignore functionality cannot discard non-ASCII bytes. The data has to be filtered before + // passing it to "data-encoding", so there is no point in doing any filtering in "data-encoding". This also + // allows execution to stay on the happy path in "data-encoding": + // https://github.com/ia0/data-encoding/blob/4f42ad7ef242f6d243e4de90cd1b46a57690d00e/lib/src/lib.rs#L754-L756 + let (encoding_to_use, filter_data_option) = { + if ignore_garbage { + // Note that the alphabet constants above already include the padding characters + // TODO + // Precompute this + let table = alphabet_to_table(alphabet); + + (encoding, Some(FilteringData { table })) + } else { + let mut sp = encoding.specification(); + + // '\n' and '\r' are always ignored + sp.ignore = "\n\r".to_owned(); + + let en = match sp.encoding() { + Ok(en) => en, + Err(sp) => { + return Err(USimpleError::new(1_i32, format!("{sp}"))); + } + }; + + (en, None) + } + }; + + // Start of buffers + // Data that was read from stdin + let mut input_buffer = vec![0_u8; INPUT_BUFFER_SIZE]; + + assert!(!input_buffer.is_empty()); + + // Data that was read from stdin but has not been decoded yet + let mut leftover_buffer = Vec::::new(); + + // Decoded data that needs to be written to stdout + let mut decoded_buffer = Vec::::new(); + + // Buffer that will be used when "ignore_garbage" is true, and the chunk read from "input" contains garbage + // data + let mut non_garbage_buffer = Vec::::new(); + // End of buffers + + let mut stdout_lock = io::stdout().lock(); + + loop { + match input.read(&mut input_buffer) { + Ok(bytes_read_from_input) => { + if bytes_read_from_input == 0_usize { + break; + } + + let read_buffer_filtered = { + // The part of `input_buffer` that was actually filled by the call to `read` + let read_buffer = &input_buffer[..bytes_read_from_input]; + + if let Some(fi) = &filter_data_option { + let FilteringData { table } = fi; + + let table_to_owned = table.to_owned(); + + // First just scan the data for the happy path + // Note: this happy path check has not been validated with performance testing + let mut found_garbage = false; + + for ue in read_buffer { + if table_to_owned[usize::from(*ue)] { + // Not garbage, since it was found in the table + continue; + } else { + found_garbage = true; + + break; + } + } + + if found_garbage { + non_garbage_buffer.clear(); + + for ue in read_buffer { + if table_to_owned[usize::from(*ue)] { + // Not garbage, since it was found in the table + non_garbage_buffer.push(*ue); + } + } + + non_garbage_buffer.as_slice() + } else { + read_buffer + } + } else { + read_buffer + } + }; + + // How many bytes to steal from `read_buffer` to get `leftover_buffer` to the right size + let bytes_to_steal = decode_in_chunks_of_size - leftover_buffer.len(); + + if bytes_to_steal > bytes_read_from_input { + // Do not have enough data to decode a chunk, so copy data to `leftover_buffer` and read more + leftover_buffer.extend(read_buffer_filtered); + + continue; + } + + // Decode data in chunks, then place it in `decoded_buffer` + { + let bytes_to_chunk = if bytes_to_steal > 0_usize { + let (stolen_bytes, rest_of_read_buffer_filtered) = + read_buffer_filtered.split_at(bytes_to_steal); + + leftover_buffer.extend(stolen_bytes); + + // After appending the stolen bytes to `leftover_buffer`, it should be the right size + assert!(leftover_buffer.len() == decode_in_chunks_of_size); + + // Decode the old un-decoded data and the stolen bytes, and add the result to + // `decoded_buffer` + decode_into_vec( + &encoding_to_use, + &leftover_buffer, + &mut decoded_buffer, + )?; + + // Reset `leftover_buffer` + leftover_buffer.clear(); + + rest_of_read_buffer_filtered + } else { + // Do not need to steal bytes from `read_buffer` + read_buffer_filtered + }; + + let chunks_exact = bytes_to_chunk.chunks_exact(decode_in_chunks_of_size); + + let remainder = chunks_exact.remainder(); + + for sl in chunks_exact { + assert!(sl.len() == decode_in_chunks_of_size); + + decode_into_vec(&encoding_to_use, sl, &mut decoded_buffer)?; + } + + leftover_buffer.extend(remainder); + } + + // Write all data in `decoded_buffer` to stdout + write_to_stdout(&mut decoded_buffer, &mut stdout_lock)?; + } + Err(er) => { + if er.kind() == ErrorKind::Interrupted { + // TODO + // Retry reading? + } + + return Err(USimpleError::new(1_i32, format!("read error: {er}"))); + } + } + } + + // Cleanup + // `input` has finished producing data, so the data remaining in the buffers needs to be decoded and printed + { + // Decode all remaining encoded bytes, placing them in `decoded_buffer` + decode_into_vec(&encoding_to_use, &leftover_buffer, &mut decoded_buffer)?; + + // Write all data in `decoded_buffer` to stdout + write_to_stdout(&mut decoded_buffer, &mut stdout_lock)?; + } + + Ok(()) + } +} diff --git a/src/uu/cksum/src/cksum.rs b/src/uu/cksum/src/cksum.rs index 7ba9f78de6a..b9654ca8654 100644 --- a/src/uu/cksum/src/cksum.rs +++ b/src/uu/cksum/src/cksum.rs @@ -111,8 +111,7 @@ where OutputFormat::Hexadecimal => sum_hex, OutputFormat::Base64 => match options.algo_name { ALGORITHM_OPTIONS_CRC | ALGORITHM_OPTIONS_SYSV | ALGORITHM_OPTIONS_BSD => sum_hex, - _ => encoding::encode(encoding::Format::Base64, &hex::decode(sum_hex).unwrap()) - .unwrap(), + _ => encoding::encode_base_six_four(&hex::decode(sum_hex).unwrap()), }, }; // The BSD checksum output is 5 digit integer diff --git a/src/uucore/src/lib/features/encoding.rs b/src/uucore/src/lib/features/encoding.rs index 4beee6032bb..8befec8ef62 100644 --- a/src/uucore/src/lib/features/encoding.rs +++ b/src/uucore/src/lib/features/encoding.rs @@ -4,39 +4,26 @@ // file that was distributed with this source code. // spell-checker:ignore (strings) ABCDEFGHIJKLMNOPQRSTUVWXYZ ABCDEFGHIJKLMNOPQRSTUV -// spell-checker:ignore (encodings) lsbf msbf hexupper +// spell-checker:ignore (encodings) lsbf msbf -use self::Format::*; -use data_encoding::{Encoding, BASE32, BASE32HEX, BASE64, BASE64URL, HEXUPPER}; +use data_encoding::{Encoding, BASE64}; use data_encoding_macro::new_encoding; -use std::io::{self, Read, Write}; - -#[cfg(feature = "thiserror")] -use thiserror::Error; +use std::{ + error::Error, + io::{self, Read, Write}, +}; // Re-export for the faster encoding logic pub mod for_fast_encode { pub use data_encoding::*; } -#[derive(Debug, Error)] -pub enum DecodeError { - #[error("{}", _0)] - Decode(#[from] data_encoding::DecodeError), - #[error("{}", _0)] - DecodeZ85(#[from] z85::DecodeError), - #[error("{}", _0)] - Io(#[from] io::Error), -} - #[derive(Debug)] pub enum EncodeError { Z85InputLenNotMultipleOf4, InvalidInput, } -pub type DecodeResult = Result, DecodeError>; - #[derive(Clone, Copy)] pub enum Format { Base64, @@ -53,94 +40,62 @@ pub const BASE2LSBF: Encoding = new_encoding! { symbols: "01", bit_order: LeastSignificantFirst, }; + pub const BASE2MSBF: Encoding = new_encoding! { symbols: "01", bit_order: MostSignificantFirst, }; -pub fn encode(f: Format, input: &[u8]) -> Result { - Ok(match f { - Base32 => BASE32.encode(input), - Base64 => BASE64.encode(input), - Base64Url => BASE64URL.encode(input), - Base32Hex => BASE32HEX.encode(input), - Base16 => HEXUPPER.encode(input), - Base2Lsbf => BASE2LSBF.encode(input), - Base2Msbf => BASE2MSBF.encode(input), - Z85 => { - // According to the spec we should not accept inputs whose len is not a multiple of 4. - // However, the z85 crate implements a padded encoding and accepts such inputs. We have to manually check for them. - if input.len() % 4 == 0 { - z85::encode(input) - } else { - return Err(EncodeError::Z85InputLenNotMultipleOf4); - } - } - }) +pub fn encode_base_six_four(input: &[u8]) -> String { + BASE64.encode(input) } -pub fn decode(f: Format, input: &[u8]) -> DecodeResult { - Ok(match f { - Base32 => BASE32.decode(input)?, - Base64 => BASE64.decode(input)?, - Base64Url => BASE64URL.decode(input)?, - Base32Hex => BASE32HEX.decode(input)?, - Base16 => HEXUPPER.decode(input)?, - Base2Lsbf => BASE2LSBF.decode(input)?, - Base2Msbf => BASE2MSBF.decode(input)?, - Z85 => { - // The z85 crate implements a padded encoding by using a leading '#' which is otherwise not allowed. - // We manually check for a leading '#' and return an error ourselves. - if input.starts_with(b"#") { - return Err(z85::DecodeError::InvalidByte(0, b'#').into()); - } else { - z85::decode(input)? - } - } - }) -} +pub fn decode_z_eight_five( + mut input: R, + ignore_garbage: bool, +) -> Result, Box> { + const Z_EIGHT_FIVE_ALPHABET: &[u8; 85_usize] = + b"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.-:+=^!/*?&<>()[]{}@%$#"; + + let mut buf = Vec::::new(); + + input.read_to_end(&mut buf)?; + + if ignore_garbage { + let table = alphabet_to_table(Z_EIGHT_FIVE_ALPHABET); -pub struct Data { - input: R, - format: Format, - alphabet: &'static [u8], + buf.retain(|&ue| table[usize::from(ue)]); + } else { + buf.retain(|&ue| ue != b'\n' && ue != b'\r'); + }; + + // The z85 crate implements a padded encoding by using a leading '#' which is otherwise not allowed. + // We manually check for a leading '#' and return an error ourselves. + let vec = if buf.starts_with(b"#") { + return Err(Box::from("'#' character at index 0 is invalid".to_owned())); + } else { + z85::decode(buf)? + }; + + Ok(vec) } -impl Data { - pub fn new(input: R, format: Format) -> Self { - Self { - input, - format, - alphabet: match format { - Base32 => b"ABCDEFGHIJKLMNOPQRSTUVWXYZ234567=", - Base64 => b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789=+/", - Base64Url => b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789=_-", - Base32Hex => b"0123456789ABCDEFGHIJKLMNOPQRSTUV=", - Base16 => b"0123456789ABCDEF", - Base2Lsbf => b"01", - Base2Msbf => b"01", - Z85 => b"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.-:+=^!/*?&<>()[]{}@%$#", - }, - } - } +pub fn encode_z_eight_five(mut input: R) -> Result { + let mut buf = Vec::::new(); - pub fn decode(&mut self, ignore_garbage: bool) -> DecodeResult { - let mut buf = vec![]; - self.input.read_to_end(&mut buf)?; - if ignore_garbage { - buf.retain(|c| self.alphabet.contains(c)); - } else { - buf.retain(|&c| c != b'\r' && c != b'\n'); - }; - decode(self.format, &buf) - } + match input.read_to_end(&mut buf) { + Ok(_) => { + let buf_slice = buf.as_slice(); - pub fn encode(&mut self) -> Result { - let mut buf: Vec = vec![]; - match self.input.read_to_end(&mut buf) { - Ok(_) => encode(self.format, buf.as_slice()), - Err(_) => Err(EncodeError::InvalidInput), + // According to the spec we should not accept inputs whose len is not a multiple of 4. + // However, the z85 crate implements a padded encoding and accepts such inputs. We have to manually check for them. + if buf_slice.len() % 4_usize == 0_usize { + Ok(z85::encode(buf_slice)) + } else { + Err(EncodeError::Z85InputLenNotMultipleOf4) + } } + Err(_) => Err(EncodeError::InvalidInput), } } @@ -169,3 +124,18 @@ pub fn wrap_print(res: &str, line_wrap: usize) -> io::Result<()> { Ok(()) } + +pub fn alphabet_to_table(alphabet: &[u8]) -> [bool; 256_usize] { + let mut table = [false; 256_usize]; + + for ue in alphabet { + let us = usize::from(*ue); + + // Should not have been set yet + assert!(!table[us]); + + table[us] = true; + } + + table +} From 8a6923dcd4cba703a5cbb8f706dfc95f47ce7c9e Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Sat, 21 Sep 2024 09:35:20 -0500 Subject: [PATCH 06/17] Fix line wrapping encoding performance --- src/uu/base32/src/base_common.rs | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/src/uu/base32/src/base_common.rs b/src/uu/base32/src/base_common.rs index 90a3b15b97d..fa381734914 100644 --- a/src/uu/base32/src/base_common.rs +++ b/src/uu/base32/src/base_common.rs @@ -362,29 +362,25 @@ mod fast_encode { assert!(line_length_usize > 0_usize); - let number_of_lines = encoded_buffer.len() / line_length_usize; + let make_contiguous_result = encoded_buffer.make_contiguous(); // How many bytes to take from the front of `encoded_buffer` and then write to stdout - let number_of_bytes_to_drain = number_of_lines * line_length_usize; + // (Number of whole lines times the line length) + let number_of_bytes_to_drain = + (make_contiguous_result.len() / line_length_usize) * line_length_usize; - let line_wrap_size_minus_one = line_length_usize - 1_usize; + let chunks_exact = make_contiguous_result.chunks_exact(line_length_usize); - let mut i = 0_usize; - - for ue in encoded_buffer.drain(..number_of_bytes_to_drain) { - print_buffer.push(ue); - - if i == line_wrap_size_minus_one { - print_buffer.push(b'\n'); - - i = 0_usize; - } else { - i += 1_usize; - } + for sl in chunks_exact { + print_buffer.extend_from_slice(sl); + print_buffer.push(b'\n'); } stdout_lock.write_all(print_buffer)?; + // Remove the bytes that were just printed from `encoded_buffer` + drop(encoded_buffer.drain(..number_of_bytes_to_drain)); + if is_cleanup { if encoded_buffer.is_empty() { // Do not write a newline in this case, because two trailing newlines should never be printed From 5cd050665d1ebf7d08cecfca667bbe86d2ad8ed0 Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Sat, 21 Sep 2024 10:31:43 -0500 Subject: [PATCH 07/17] Get number of bytes to drain in a safer way --- src/uu/base32/src/base_common.rs | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/uu/base32/src/base_common.rs b/src/uu/base32/src/base_common.rs index fa381734914..ffde6c7e5d9 100644 --- a/src/uu/base32/src/base_common.rs +++ b/src/uu/base32/src/base_common.rs @@ -364,14 +364,13 @@ mod fast_encode { let make_contiguous_result = encoded_buffer.make_contiguous(); - // How many bytes to take from the front of `encoded_buffer` and then write to stdout - // (Number of whole lines times the line length) - let number_of_bytes_to_drain = - (make_contiguous_result.len() / line_length_usize) * line_length_usize; - let chunks_exact = make_contiguous_result.chunks_exact(line_length_usize); + let mut bytes_added_to_print_buffer = 0_usize; + for sl in chunks_exact { + bytes_added_to_print_buffer += sl.len(); + print_buffer.extend_from_slice(sl); print_buffer.push(b'\n'); } @@ -379,7 +378,7 @@ mod fast_encode { stdout_lock.write_all(print_buffer)?; // Remove the bytes that were just printed from `encoded_buffer` - drop(encoded_buffer.drain(..number_of_bytes_to_drain)); + drop(encoded_buffer.drain(..bytes_added_to_print_buffer)); if is_cleanup { if encoded_buffer.is_empty() { From cdebd247333c6c277cb6255ac65fb16a588e4939 Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Sun, 22 Sep 2024 08:28:20 -0500 Subject: [PATCH 08/17] Finish fast decode/encode by folding in Z85 --- src/uu/base32/src/base_common.rs | 336 ++++++++++-------------- src/uu/cksum/src/cksum.rs | 2 +- src/uucore/Cargo.toml | 2 +- src/uucore/src/lib/features/encoding.rs | 159 +++++------ tests/by-util/test_basenc.rs | 7 +- 5 files changed, 233 insertions(+), 273 deletions(-) diff --git a/src/uu/base32/src/base_common.rs b/src/uu/base32/src/base_common.rs index ffde6c7e5d9..a9990757d5d 100644 --- a/src/uu/base32/src/base_common.rs +++ b/src/uu/base32/src/base_common.rs @@ -7,14 +7,12 @@ use clap::{crate_version, Arg, ArgAction, Command}; use std::fs::File; -use std::io::{stdout, Read, Write}; -use std::io::{BufReader, Stdin}; +use std::io::{BufReader, Read, Stdin}; use std::path::Path; use uucore::display::Quotable; -use uucore::encoding::{decode_z_eight_five, encode_z_eight_five, BASE2LSBF, BASE2MSBF}; use uucore::encoding::{ for_fast_encode::{BASE32, BASE32HEX, BASE64, BASE64URL, HEXUPPER}, - wrap_print, EncodeError, Format, + Format, ZEightFiveWrapper, BASE2LSBF, BASE2MSBF, }; use uucore::error::{FromIo, UResult, USimpleError, UUsageError}; use uucore::format_usage; @@ -172,130 +170,134 @@ pub fn handle_input( // "The quick brown fox" // is 19 characters, which is not divisible by 3, so its Base64 representation has padding: // "VGhlIHF1aWNrIGJyb3duIGZveA==" - // The encoding logic in this function depends on these constants being correct, so do not modify - // them. Performance can be tuned by multiplying these numbers by a different multiple (see - // `DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE` above). + // + // The encoding performed by `fast_encode` depend on these constants being correct. Performance can be tuned by + // multiplying these numbers by a different multiple (see `DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE` above). const BASE16_UN_PADDED_MULTIPLE: usize = 1_usize; const BASE2_UN_PADDED_MULTIPLE: usize = 1_usize; const BASE32_UN_PADDED_MULTIPLE: usize = 5_usize; const BASE64_UN_PADDED_MULTIPLE: usize = 3_usize; + const Z85_UN_PADDED_MULTIPLE: usize = 4_usize; - const BASE16_ENCODE_IN_CHUNKS_OF_SIZE: usize = - BASE16_UN_PADDED_MULTIPLE * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE; - const BASE2_ENCODE_IN_CHUNKS_OF_SIZE: usize = - BASE2_UN_PADDED_MULTIPLE * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE; - const BASE32_ENCODE_IN_CHUNKS_OF_SIZE: usize = - BASE32_UN_PADDED_MULTIPLE * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE; - const BASE64_ENCODE_IN_CHUNKS_OF_SIZE: usize = - BASE64_UN_PADDED_MULTIPLE * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE; - + // Similar to above, but for decoding const BASE16_VALID_DECODING_MULTIPLE: usize = 2_usize; const BASE2_VALID_DECODING_MULTIPLE: usize = 8_usize; const BASE32_VALID_DECODING_MULTIPLE: usize = 8_usize; const BASE64_VALID_DECODING_MULTIPLE: usize = 4_usize; - - const BASE16_DECODE_IN_CHUNKS_OF_SIZE: usize = - BASE16_VALID_DECODING_MULTIPLE * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE; - const BASE2_DECODE_IN_CHUNKS_OF_SIZE: usize = - BASE2_VALID_DECODING_MULTIPLE * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE; - const BASE32_DECODE_IN_CHUNKS_OF_SIZE: usize = - BASE32_VALID_DECODING_MULTIPLE * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE; - const BASE64_DECODE_IN_CHUNKS_OF_SIZE: usize = - BASE64_VALID_DECODING_MULTIPLE * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE; + const Z85_VALID_DECODING_MULTIPLE: usize = 5_usize; if decode { - let encoding_and_decode_in_chunks_of_size_and_alphabet: (_, _, &[u8]) = match format { - // Use naive approach for Z85, since the crate being used doesn't have the API needed + let (encoding, valid_decoding_multiple, alphabet): (_, _, &[u8]) = match format { + // Use naive approach (now only semi-naive) for Z85, since the crate being used doesn't have the API + // needed Format::Z85 => { - let result = match decode_z_eight_five(input, ignore_garbage) { - Ok(ve) => { - if stdout().write_all(&ve).is_err() { - // on windows console, writing invalid utf8 returns an error - return Err(USimpleError::new( - 1_i32, - "error: cannot write non-utf8 data", - )); - } - - Ok(()) - } - Err(_) => Err(USimpleError::new(1_i32, "error: invalid input")), - }; - - return result; + // spell-checker:disable-next-line + let alphabet = b"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.-:+=^!/*?&<>()[]{}@%$#"; + + fast_decode::fast_decode( + input, + ( + ZEightFiveWrapper {}, + Z85_VALID_DECODING_MULTIPLE * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE, + alphabet, + ), + ignore_garbage, + )?; + + return Ok(()); } // For these, use faster, new decoding logic Format::Base16 => ( HEXUPPER, - BASE16_DECODE_IN_CHUNKS_OF_SIZE, + BASE16_VALID_DECODING_MULTIPLE, + // spell-checker:disable-next-line b"0123456789ABCDEF", ), - Format::Base2Lsbf => (BASE2LSBF, BASE2_DECODE_IN_CHUNKS_OF_SIZE, b"01"), - Format::Base2Msbf => (BASE2MSBF, BASE2_DECODE_IN_CHUNKS_OF_SIZE, b"01"), + Format::Base2Lsbf => ( + BASE2LSBF, + BASE2_VALID_DECODING_MULTIPLE, + // spell-checker:disable-next-line + b"01", + ), + Format::Base2Msbf => ( + BASE2MSBF, + BASE2_VALID_DECODING_MULTIPLE, + // spell-checker:disable-next-line + b"01", + ), Format::Base32 => ( BASE32, - BASE32_DECODE_IN_CHUNKS_OF_SIZE, + BASE32_VALID_DECODING_MULTIPLE, + // spell-checker:disable-next-line b"ABCDEFGHIJKLMNOPQRSTUVWXYZ234567=", ), Format::Base32Hex => ( BASE32HEX, - BASE32_DECODE_IN_CHUNKS_OF_SIZE, + BASE32_VALID_DECODING_MULTIPLE, // spell-checker:disable-next-line b"0123456789ABCDEFGHIJKLMNOPQRSTUV=", ), Format::Base64 => ( BASE64, - BASE64_DECODE_IN_CHUNKS_OF_SIZE, + BASE64_VALID_DECODING_MULTIPLE, + // spell-checker:disable-next-line b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789=+/", ), Format::Base64Url => ( BASE64URL, - BASE64_DECODE_IN_CHUNKS_OF_SIZE, + BASE64_VALID_DECODING_MULTIPLE, + // spell-checker:disable-next-line b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789=_-", ), }; fast_decode::fast_decode( input, - encoding_and_decode_in_chunks_of_size_and_alphabet, + ( + encoding, + valid_decoding_multiple * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE, + alphabet, + ), ignore_garbage, )?; Ok(()) } else { - let encoding_and_encode_in_chunks_of_size = match format { - // Use naive approach for Z85, since the crate being used doesn't have the API needed + let (encoding, un_padded_multiple) = match format { + // Use naive approach for Z85 (now only semi-naive), since the crate being used doesn't have the API + // needed Format::Z85 => { - let result = match encode_z_eight_five(input) { - Ok(st) => { - wrap_print(&st, wrap.unwrap_or(WRAP_DEFAULT))?; - - Ok(()) - } - Err(EncodeError::InvalidInput) => { - Err(USimpleError::new(1_i32, "error: invalid input")) - } - Err(_) => Err(USimpleError::new( - 1_i32, - "error: invalid input (length must be multiple of 4 characters)", - )), - }; - - return result; + fast_encode::fast_encode( + input, + ( + ZEightFiveWrapper {}, + Z85_UN_PADDED_MULTIPLE * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE, + ), + wrap, + )?; + + return Ok(()); } // For these, use faster, new encoding logic - Format::Base16 => (HEXUPPER, BASE16_ENCODE_IN_CHUNKS_OF_SIZE), - Format::Base2Lsbf => (BASE2LSBF, BASE2_ENCODE_IN_CHUNKS_OF_SIZE), - Format::Base2Msbf => (BASE2MSBF, BASE2_ENCODE_IN_CHUNKS_OF_SIZE), - Format::Base32 => (BASE32, BASE32_ENCODE_IN_CHUNKS_OF_SIZE), - Format::Base32Hex => (BASE32HEX, BASE32_ENCODE_IN_CHUNKS_OF_SIZE), - Format::Base64 => (BASE64, BASE64_ENCODE_IN_CHUNKS_OF_SIZE), - Format::Base64Url => (BASE64URL, BASE64_ENCODE_IN_CHUNKS_OF_SIZE), + Format::Base16 => (HEXUPPER, BASE16_UN_PADDED_MULTIPLE), + Format::Base2Lsbf => (BASE2LSBF, BASE2_UN_PADDED_MULTIPLE), + Format::Base2Msbf => (BASE2MSBF, BASE2_UN_PADDED_MULTIPLE), + Format::Base32 => (BASE32, BASE32_UN_PADDED_MULTIPLE), + Format::Base32Hex => (BASE32HEX, BASE32_UN_PADDED_MULTIPLE), + Format::Base64 => (BASE64, BASE64_UN_PADDED_MULTIPLE), + Format::Base64Url => (BASE64URL, BASE64_UN_PADDED_MULTIPLE), }; - fast_encode::fast_encode(input, encoding_and_encode_in_chunks_of_size, wrap)?; + fast_encode::fast_encode( + input, + ( + encoding, + un_padded_multiple * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE, + ), + wrap, + )?; Ok(()) } @@ -306,29 +308,18 @@ mod fast_encode { use std::{ collections::VecDeque, io::{self, ErrorKind, Read, StdoutLock, Write}, + num::{NonZero, NonZeroUsize}, }; use uucore::{ - encoding::for_fast_encode::Encoding, + encoding::SupportsFastEncode, error::{UResult, USimpleError}, }; struct LineWrapping { - line_length: usize, + line_length: NonZeroUsize, print_buffer: Vec, } - // Start of helper functions - // Adapted from `encode_append` in the "data-encoding" crate - fn encode_append_vec_deque(encoding: &Encoding, input: &[u8], output: &mut VecDeque) { - let output_len = output.len(); - - output.resize(output_len + encoding.encode_len(input.len()), 0_u8); - - let make_contiguous_result = output.make_contiguous(); - - encoding.encode_mut(input, &mut (make_contiguous_result[output_len..])); - } - fn write_without_line_breaks( encoded_buffer: &mut VecDeque, stdout_lock: &mut StdoutLock, @@ -358,9 +349,7 @@ mod fast_encode { stdout_lock: &mut StdoutLock, is_cleanup: bool, ) -> io::Result<()> { - let line_length_usize = *line_length; - - assert!(line_length_usize > 0_usize); + let line_length_usize = line_length.get(); let make_contiguous_result = encoded_buffer.make_contiguous(); @@ -420,9 +409,9 @@ mod fast_encode { // https://github.com/ia0/data-encoding/blob/4f42ad7ef242f6d243e4de90cd1b46a57690d00e/lib/src/lib.rs#L1710 // /// `encoding` and `encode_in_chunks_of_size` are passed in a tuple to indicate that they are logically tied - pub fn fast_encode( + pub fn fast_encode( input: &mut R, - (encoding, encode_in_chunks_of_size): (Encoding, usize), + (supports_fast_encode, encode_in_chunks_of_size): (S, usize), line_wrap: Option, ) -> UResult<()> { /// Rust uses 8 kibibytes @@ -435,12 +424,12 @@ mod fast_encode { Some(0_usize) => None, // A custom line wrapping value was passed Some(an) => Some(LineWrapping { - line_length: an, + line_length: NonZero::new(an).unwrap(), print_buffer: Vec::::new(), }), // Line wrapping was not set, so the default is used None => Some(LineWrapping { - line_length: WRAP_DEFAULT, + line_length: NonZero::new(WRAP_DEFAULT).unwrap(), print_buffer: Vec::::new(), }), }; @@ -493,11 +482,10 @@ mod fast_encode { // Encode the old unencoded data and the stolen bytes, and add the result to // `encoded_buffer` - encode_append_vec_deque( - &encoding, + supports_fast_encode.encode_to_vec_deque( leftover_buffer.make_contiguous(), &mut encoded_buffer, - ); + )?; // Reset `leftover_buffer` leftover_buffer.clear(); @@ -515,7 +503,7 @@ mod fast_encode { for sl in chunks_exact { assert!(sl.len() == encode_in_chunks_of_size); - encode_append_vec_deque(&encoding, sl, &mut encoded_buffer); + supports_fast_encode.encode_to_vec_deque(sl, &mut encoded_buffer)?; } leftover_buffer.extend(remainder); @@ -544,11 +532,8 @@ mod fast_encode { // `input` has finished producing data, so the data remaining in the buffers needs to be encoded and printed { // Encode all remaining unencoded bytes, placing them in `encoded_buffer` - encode_append_vec_deque( - &encoding, - leftover_buffer.make_contiguous(), - &mut encoded_buffer, - ); + supports_fast_encode + .encode_to_vec_deque(leftover_buffer.make_contiguous(), &mut encoded_buffer)?; // Write all data in `encoded_buffer` to stdout // `is_cleanup` triggers special cleanup-only logic @@ -567,41 +552,45 @@ mod fast_encode { mod fast_decode { use std::io::{self, ErrorKind, Read, StdoutLock, Write}; use uucore::{ - encoding::{alphabet_to_table, for_fast_encode::Encoding}, + encoding::SupportsFastDecode, error::{UResult, USimpleError}, }; - struct FilteringData { - table: [bool; 256_usize], - } - // Start of helper functions - // Adapted from `decode` in the "data-encoding" crate - fn decode_into_vec(encoding: &Encoding, input: &[u8], output: &mut Vec) -> UResult<()> { - let decode_len_result = match encoding.decode_len(input.len()) { - Ok(us) => us, - Err(de) => { - return Err(USimpleError::new(1_i32, format!("{de}"))); - } - }; + pub fn alphabet_to_table(alphabet: &[u8], ignore_garbage: bool) -> [bool; 256_usize] { + // If "ignore_garbage" is enabled, all characters outside the alphabet are ignored + // If it is not enabled, only '\n' and '\r' are ignored + if ignore_garbage { + // Note: "false" here + let mut table = [false; 256_usize]; - let output_len = output.len(); + // Pass through no characters except those in the alphabet + for ue in alphabet { + let us = usize::from(*ue); - output.resize(output_len + decode_len_result, 0_u8); + // Should not have been set yet + assert!(!table[us]); - match encoding.decode_mut(input, &mut (output[output_len..])) { - Ok(us) => { - // See: - // https://docs.rs/data-encoding/latest/data_encoding/struct.Encoding.html#method.decode_mut - // "Returns the length of the decoded output. This length may be smaller than the output length if the input contained padding or ignored characters. The output bytes after the returned length are not initialized and should not be read." - output.truncate(output_len + us); + table[us] = true; } - Err(_de) => { - return Err(USimpleError::new(1_i32, "error: invalid input".to_owned())); + + table + } else { + // Note: "true" here + let mut table = [true; 256_usize]; + + // Pass through all characters except '\n' and '\r' to + for ue in [b'\n', b'\r'] { + let us = usize::from(ue); + + // Should not have been set yet + assert!(table[us]); + + table[us] = false; } - } - Ok(()) + table + } } fn write_to_stdout( @@ -619,9 +608,9 @@ mod fast_decode { /// `encoding`, `decode_in_chunks_of_size`, and `alphabet` are passed in a tuple to indicate that they are /// logically tied - pub fn fast_decode( + pub fn fast_decode( input: &mut R, - (encoding, decode_in_chunks_of_size, alphabet): (Encoding, usize, &[u8]), + (supports_fast_decode, decode_in_chunks_of_size, alphabet): (S, usize, &[u8]), ignore_garbage: bool, ) -> UResult<()> { /// Rust uses 8 kibibytes @@ -634,30 +623,12 @@ mod fast_decode { // passing it to "data-encoding", so there is no point in doing any filtering in "data-encoding". This also // allows execution to stay on the happy path in "data-encoding": // https://github.com/ia0/data-encoding/blob/4f42ad7ef242f6d243e4de90cd1b46a57690d00e/lib/src/lib.rs#L754-L756 - let (encoding_to_use, filter_data_option) = { - if ignore_garbage { - // Note that the alphabet constants above already include the padding characters - // TODO - // Precompute this - let table = alphabet_to_table(alphabet); - - (encoding, Some(FilteringData { table })) - } else { - let mut sp = encoding.specification(); - - // '\n' and '\r' are always ignored - sp.ignore = "\n\r".to_owned(); - - let en = match sp.encoding() { - Ok(en) => en, - Err(sp) => { - return Err(USimpleError::new(1_i32, format!("{sp}"))); - } - }; - - (en, None) - } - }; + // Update: it is not even worth it to use "data-encoding"'s ignore functionality when "ignore_garbage" is + // false. + // Note that the alphabet constants above already include the padding characters + // TODO + // Precompute this + let table = alphabet_to_table(alphabet, ignore_garbage); // Start of buffers // Data that was read from stdin @@ -689,40 +660,24 @@ mod fast_decode { // The part of `input_buffer` that was actually filled by the call to `read` let read_buffer = &input_buffer[..bytes_read_from_input]; - if let Some(fi) = &filter_data_option { - let FilteringData { table } = fi; + // First just scan the data for the happy path + // Note: this happy path check has not been validated with performance testing + let found_garbage = read_buffer.iter().any(|ue| { + // Garbage, since it was not found in the table + !table[usize::from(*ue)] + }); - let table_to_owned = table.to_owned(); - - // First just scan the data for the happy path - // Note: this happy path check has not been validated with performance testing - let mut found_garbage = false; + if found_garbage { + non_garbage_buffer.clear(); for ue in read_buffer { - if table_to_owned[usize::from(*ue)] { + if table[usize::from(*ue)] { // Not garbage, since it was found in the table - continue; - } else { - found_garbage = true; - - break; + non_garbage_buffer.push(*ue); } } - if found_garbage { - non_garbage_buffer.clear(); - - for ue in read_buffer { - if table_to_owned[usize::from(*ue)] { - // Not garbage, since it was found in the table - non_garbage_buffer.push(*ue); - } - } - - non_garbage_buffer.as_slice() - } else { - read_buffer - } + non_garbage_buffer.as_slice() } else { read_buffer } @@ -751,11 +706,8 @@ mod fast_decode { // Decode the old un-decoded data and the stolen bytes, and add the result to // `decoded_buffer` - decode_into_vec( - &encoding_to_use, - &leftover_buffer, - &mut decoded_buffer, - )?; + supports_fast_decode + .decode_into_vec(&leftover_buffer, &mut decoded_buffer)?; // Reset `leftover_buffer` leftover_buffer.clear(); @@ -773,7 +725,7 @@ mod fast_decode { for sl in chunks_exact { assert!(sl.len() == decode_in_chunks_of_size); - decode_into_vec(&encoding_to_use, sl, &mut decoded_buffer)?; + supports_fast_decode.decode_into_vec(sl, &mut decoded_buffer)?; } leftover_buffer.extend(remainder); @@ -797,7 +749,7 @@ mod fast_decode { // `input` has finished producing data, so the data remaining in the buffers needs to be decoded and printed { // Decode all remaining encoded bytes, placing them in `decoded_buffer` - decode_into_vec(&encoding_to_use, &leftover_buffer, &mut decoded_buffer)?; + supports_fast_decode.decode_into_vec(&leftover_buffer, &mut decoded_buffer)?; // Write all data in `decoded_buffer` to stdout write_to_stdout(&mut decoded_buffer, &mut stdout_lock)?; diff --git a/src/uu/cksum/src/cksum.rs b/src/uu/cksum/src/cksum.rs index b9654ca8654..3b1057d7015 100644 --- a/src/uu/cksum/src/cksum.rs +++ b/src/uu/cksum/src/cksum.rs @@ -111,7 +111,7 @@ where OutputFormat::Hexadecimal => sum_hex, OutputFormat::Base64 => match options.algo_name { ALGORITHM_OPTIONS_CRC | ALGORITHM_OPTIONS_SYSV | ALGORITHM_OPTIONS_BSD => sum_hex, - _ => encoding::encode_base_six_four(&hex::decode(sum_hex).unwrap()), + _ => encoding::for_cksum::BASE64.encode(&hex::decode(sum_hex).unwrap()), }, }; // The BSD checksum output is 5 digit integer diff --git a/src/uucore/Cargo.toml b/src/uucore/Cargo.toml index dd48d1b4fcf..4774180e7eb 100644 --- a/src/uucore/Cargo.toml +++ b/src/uucore/Cargo.toml @@ -77,7 +77,7 @@ default = [] backup-control = [] colors = [] checksum = ["data-encoding", "thiserror", "regex", "sum"] -encoding = ["data-encoding", "data-encoding-macro", "z85", "thiserror"] +encoding = ["data-encoding", "data-encoding-macro", "z85"] entries = ["libc"] fs = ["dunce", "libc", "winapi-util", "windows-sys"] fsext = ["libc", "windows-sys"] diff --git a/src/uucore/src/lib/features/encoding.rs b/src/uucore/src/lib/features/encoding.rs index 8befec8ef62..c4831e032ff 100644 --- a/src/uucore/src/lib/features/encoding.rs +++ b/src/uucore/src/lib/features/encoding.rs @@ -3,25 +3,20 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -// spell-checker:ignore (strings) ABCDEFGHIJKLMNOPQRSTUVWXYZ ABCDEFGHIJKLMNOPQRSTUV // spell-checker:ignore (encodings) lsbf msbf -use data_encoding::{Encoding, BASE64}; +use crate::error::{UResult, USimpleError}; +use data_encoding::Encoding; use data_encoding_macro::new_encoding; -use std::{ - error::Error, - io::{self, Read, Write}, -}; +use std::collections::VecDeque; // Re-export for the faster encoding logic pub mod for_fast_encode { pub use data_encoding::*; } -#[derive(Debug)] -pub enum EncodeError { - Z85InputLenNotMultipleOf4, - InvalidInput, +pub mod for_cksum { + pub use data_encoding::BASE64; } #[derive(Clone, Copy)] @@ -46,96 +41,104 @@ pub const BASE2MSBF: Encoding = new_encoding! { bit_order: MostSignificantFirst, }; -pub fn encode_base_six_four(input: &[u8]) -> String { - BASE64.encode(input) -} +pub struct ZEightFiveWrapper {} -pub fn decode_z_eight_five( - mut input: R, - ignore_garbage: bool, -) -> Result, Box> { - const Z_EIGHT_FIVE_ALPHABET: &[u8; 85_usize] = - b"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.-:+=^!/*?&<>()[]{}@%$#"; +pub trait SupportsFastEncode { + fn encode_to_vec_deque(&self, input: &[u8], output: &mut VecDeque) -> UResult<()>; +} - let mut buf = Vec::::new(); +impl SupportsFastEncode for ZEightFiveWrapper { + fn encode_to_vec_deque(&self, input: &[u8], output: &mut VecDeque) -> UResult<()> { + // According to the spec we should not accept inputs whose len is not a multiple of 4. + // However, the z85 crate implements a padded encoding and accepts such inputs. We have to manually check for them. + if input.len() % 4_usize != 0_usize { + return Err(USimpleError::new( + 1_i32, + "error: invalid input (length must be multiple of 4 characters)".to_owned(), + )); + } - input.read_to_end(&mut buf)?; + let string = z85::encode(input); - if ignore_garbage { - let table = alphabet_to_table(Z_EIGHT_FIVE_ALPHABET); + output.extend(string.as_bytes()); - buf.retain(|&ue| table[usize::from(ue)]); - } else { - buf.retain(|&ue| ue != b'\n' && ue != b'\r'); - }; + Ok(()) + } +} - // The z85 crate implements a padded encoding by using a leading '#' which is otherwise not allowed. - // We manually check for a leading '#' and return an error ourselves. - let vec = if buf.starts_with(b"#") { - return Err(Box::from("'#' character at index 0 is invalid".to_owned())); - } else { - z85::decode(buf)? - }; +impl SupportsFastEncode for Encoding { + // Adapted from `encode_append` in the "data-encoding" crate + fn encode_to_vec_deque(&self, input: &[u8], output: &mut VecDeque) -> UResult<()> { + let output_len = output.len(); - Ok(vec) -} + output.resize(output_len + self.encode_len(input.len()), 0_u8); -pub fn encode_z_eight_five(mut input: R) -> Result { - let mut buf = Vec::::new(); + let make_contiguous_result = output.make_contiguous(); - match input.read_to_end(&mut buf) { - Ok(_) => { - let buf_slice = buf.as_slice(); + self.encode_mut(input, &mut (make_contiguous_result[output_len..])); - // According to the spec we should not accept inputs whose len is not a multiple of 4. - // However, the z85 crate implements a padded encoding and accepts such inputs. We have to manually check for them. - if buf_slice.len() % 4_usize == 0_usize { - Ok(z85::encode(buf_slice)) - } else { - Err(EncodeError::Z85InputLenNotMultipleOf4) - } - } - Err(_) => Err(EncodeError::InvalidInput), + Ok(()) } } -pub fn wrap_print(res: &str, line_wrap: usize) -> io::Result<()> { - let stdout = io::stdout(); - - let mut stdout_lock = stdout.lock(); - - if line_wrap == 0 { - stdout_lock.write_all(res.as_bytes())?; - } else { - let res_len = res.len(); - - let mut start = 0; +pub trait SupportsFastDecode { + fn decode_into_vec(&self, input: &[u8], output: &mut Vec) -> UResult<()>; +} - while start < res_len { - let start_plus_line_wrap = start + line_wrap; +impl SupportsFastDecode for ZEightFiveWrapper { + fn decode_into_vec(&self, input: &[u8], output: &mut Vec) -> UResult<()> { + if input.first() == Some(&b'#') { + return Err(USimpleError::new(1_i32, "error: invalid input".to_owned())); + } - let end = start_plus_line_wrap.min(res_len); + // According to the spec we should not accept inputs whose len is not a multiple of 4. + // However, the z85 crate implements a padded encoding and accepts such inputs. We have to manually check for them. + if input.len() % 4_usize != 0_usize { + return Err(USimpleError::new( + 1_i32, + "error: invalid input (length must be multiple of 4 characters)".to_owned(), + )); + }; + + let decode_result = match z85::decode(input) { + Ok(ve) => ve, + Err(_de) => { + return Err(USimpleError::new(1_i32, "error: invalid input".to_owned())); + } + }; - writeln!(stdout_lock, "{}", &res[start..end])?; + output.extend_from_slice(&decode_result); - start = end; - } + Ok(()) } - - Ok(()) } -pub fn alphabet_to_table(alphabet: &[u8]) -> [bool; 256_usize] { - let mut table = [false; 256_usize]; +impl SupportsFastDecode for Encoding { + // Adapted from `decode` in the "data-encoding" crate + fn decode_into_vec(&self, input: &[u8], output: &mut Vec) -> UResult<()> { + let decode_len_result = match self.decode_len(input.len()) { + Ok(us) => us, + Err(_de) => { + return Err(USimpleError::new(1_i32, "error: invalid input".to_owned())); + } + }; - for ue in alphabet { - let us = usize::from(*ue); + let output_len = output.len(); - // Should not have been set yet - assert!(!table[us]); + output.resize(output_len + decode_len_result, 0_u8); - table[us] = true; - } + match self.decode_mut(input, &mut (output[output_len..])) { + Ok(us) => { + // See: + // https://docs.rs/data-encoding/latest/data_encoding/struct.Encoding.html#method.decode_mut + // "Returns the length of the decoded output. This length may be smaller than the output length if the input contained padding or ignored characters. The output bytes after the returned length are not initialized and should not be read." + output.truncate(output_len + us); + } + Err(_de) => { + return Err(USimpleError::new(1_i32, "error: invalid input".to_owned())); + } + } - table + Ok(()) + } } diff --git a/tests/by-util/test_basenc.rs b/tests/by-util/test_basenc.rs index fa49bd9714f..8701911f7fa 100644 --- a/tests/by-util/test_basenc.rs +++ b/tests/by-util/test_basenc.rs @@ -7,13 +7,18 @@ use crate::common::util::TestScenario; #[test] -fn test_z85_not_padded() { +fn test_z85_not_padded_decode() { // The z85 crate deviates from the standard in some cases; we have to catch those new_ucmd!() .args(&["--z85", "-d"]) .pipe_in("##########") .fails() .stderr_only("basenc: error: invalid input\n"); +} + +#[test] +fn test_z85_not_padded_encode() { + // The z85 crate deviates from the standard in some cases; we have to catch those new_ucmd!() .args(&["--z85"]) .pipe_in("123") From 2d09037c6799cf042d872ed30548edb893d6852e Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Sun, 22 Sep 2024 08:36:16 -0500 Subject: [PATCH 09/17] Fix MSRV issue --- src/uu/base32/src/base_common.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/uu/base32/src/base_common.rs b/src/uu/base32/src/base_common.rs index a9990757d5d..6de8717a2e3 100644 --- a/src/uu/base32/src/base_common.rs +++ b/src/uu/base32/src/base_common.rs @@ -308,7 +308,7 @@ mod fast_encode { use std::{ collections::VecDeque, io::{self, ErrorKind, Read, StdoutLock, Write}, - num::{NonZero, NonZeroUsize}, + num::NonZeroUsize, }; use uucore::{ encoding::SupportsFastEncode, @@ -424,12 +424,12 @@ mod fast_encode { Some(0_usize) => None, // A custom line wrapping value was passed Some(an) => Some(LineWrapping { - line_length: NonZero::new(an).unwrap(), + line_length: NonZeroUsize::new(an).unwrap(), print_buffer: Vec::::new(), }), // Line wrapping was not set, so the default is used None => Some(LineWrapping { - line_length: NonZero::new(WRAP_DEFAULT).unwrap(), + line_length: NonZeroUsize::new(WRAP_DEFAULT).unwrap(), print_buffer: Vec::::new(), }), }; @@ -579,7 +579,7 @@ mod fast_decode { // Note: "true" here let mut table = [true; 256_usize]; - // Pass through all characters except '\n' and '\r' to + // Pass through all characters except '\n' and '\r' for ue in [b'\n', b'\r'] { let us = usize::from(ue); From ccb873519c663da65b726e30e122bf333765801d Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Sun, 22 Sep 2024 20:30:15 -0500 Subject: [PATCH 10/17] Improve performance --- src/uu/base32/src/base_common.rs | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/src/uu/base32/src/base_common.rs b/src/uu/base32/src/base_common.rs index 6de8717a2e3..9947f1e9c21 100644 --- a/src/uu/base32/src/base_common.rs +++ b/src/uu/base32/src/base_common.rs @@ -7,7 +7,7 @@ use clap::{crate_version, Arg, ArgAction, Command}; use std::fs::File; -use std::io::{BufReader, Read, Stdin}; +use std::io::{Read, Stdin}; use std::path::Path; use uucore::display::Quotable; use uucore::encoding::{ @@ -141,13 +141,12 @@ pub fn base_app(about: &'static str, usage: &str) -> Command { pub fn get_input<'a>(config: &Config, stdin_ref: &'a Stdin) -> UResult> { match &config.to_read { Some(name) => { + // Do not buffer input, because buffering is handled by `fast_decode` and `fast_encode` let file_buf = File::open(Path::new(name)).map_err_context(|| name.maybe_quote().to_string())?; - Ok(Box::new(BufReader::new(file_buf))) // as Box - } - None => { - Ok(Box::new(stdin_ref.lock())) // as Box + Ok(Box::new(file_buf)) } + None => Ok(Box::new(stdin_ref.lock())), } } @@ -414,10 +413,8 @@ mod fast_encode { (supports_fast_encode, encode_in_chunks_of_size): (S, usize), line_wrap: Option, ) -> UResult<()> { - /// Rust uses 8 kibibytes - /// - /// https://github.com/rust-lang/rust/blob/1a5a2240bc1b8cf0bcce7acb946c78d6493a4fd3/library/std/src/sys_common/io.rs#L3 - const INPUT_BUFFER_SIZE: usize = 8_usize * 1_024_usize; + // Based on performance testing + const INPUT_BUFFER_SIZE: usize = 32_usize * 1_024_usize; let mut line_wrapping_option = match line_wrap { // Line wrapping is disabled because "-w"/"--wrap" was passed with "0" @@ -613,10 +610,8 @@ mod fast_decode { (supports_fast_decode, decode_in_chunks_of_size, alphabet): (S, usize, &[u8]), ignore_garbage: bool, ) -> UResult<()> { - /// Rust uses 8 kibibytes - /// - /// https://github.com/rust-lang/rust/blob/1a5a2240bc1b8cf0bcce7acb946c78d6493a4fd3/library/std/src/sys_common/io.rs#L3 - const INPUT_BUFFER_SIZE: usize = 8_usize * 1_024_usize; + // Based on performance testing + const INPUT_BUFFER_SIZE: usize = 32_usize * 1_024_usize; // Note that it's not worth using "data-encoding"'s ignore functionality if "ignore_garbage" is true, because // "data-encoding"'s ignore functionality cannot discard non-ASCII bytes. The data has to be filtered before @@ -661,7 +656,7 @@ mod fast_decode { let read_buffer = &input_buffer[..bytes_read_from_input]; // First just scan the data for the happy path - // Note: this happy path check has not been validated with performance testing + // Yields significant speedup when the input does not contain line endings let found_garbage = read_buffer.iter().any(|ue| { // Garbage, since it was not found in the table !table[usize::from(*ue)] From 4f19c0e9ffcfde474291bcc39f487b2437e909df Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Tue, 24 Sep 2024 07:49:28 -0500 Subject: [PATCH 11/17] Address PR comments. Simplify structure. --- src/uu/base32/src/base_common.rs | 495 ++++++++++++------------ src/uucore/src/lib/features/encoding.rs | 156 ++++++-- tests/by-util/test_basenc.rs | 4 +- 3 files changed, 361 insertions(+), 294 deletions(-) diff --git a/src/uu/base32/src/base_common.rs b/src/uu/base32/src/base_common.rs index 9947f1e9c21..5bee873dad7 100644 --- a/src/uu/base32/src/base_common.rs +++ b/src/uu/base32/src/base_common.rs @@ -3,28 +3,29 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -// spell-checker:ignore HEXUPPER Lsbf Msbf +// spell-checker:ignore hexupper lsbf msbf unpadded use clap::{crate_version, Arg, ArgAction, Command}; use std::fs::File; -use std::io::{Read, Stdin}; +use std::io::{ErrorKind, Read, Stdin}; use std::path::Path; use uucore::display::Quotable; use uucore::encoding::{ for_fast_encode::{BASE32, BASE32HEX, BASE64, BASE64URL, HEXUPPER}, - Format, ZEightFiveWrapper, BASE2LSBF, BASE2MSBF, + Format, Z85Wrapper, BASE2LSBF, BASE2MSBF, }; +use uucore::encoding::{EncodingWrapper, SupportsFastDecodeAndEncode}; use uucore::error::{FromIo, UResult, USimpleError, UUsageError}; use uucore::format_usage; -pub const BASE_CMD_PARSE_ERROR: i32 = 1_i32; +pub const BASE_CMD_PARSE_ERROR: i32 = 1; /// Encoded output will be formatted in lines of this length (the last line can be shorter) /// /// Other implementations default to 76 /// /// This default is only used if no "-w"/"--wrap" argument is passed -const WRAP_DEFAULT: usize = 76_usize; +const WRAP_DEFAULT: usize = 76; pub struct Config { pub decode: bool, @@ -157,160 +158,91 @@ pub fn handle_input( ignore_garbage: bool, decode: bool, ) -> UResult<()> { - const DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE: usize = 1_024_usize; - - // These constants indicate that inputs with lengths divisible by these numbers will have no padding characters - // after encoding. - // For instance: - // "The quick brown" - // is 15 characters (divisible by 3), so it is encoded in Base64 without padding: - // "VGhlIHF1aWNrIGJyb3du" - // While: - // "The quick brown fox" - // is 19 characters, which is not divisible by 3, so its Base64 representation has padding: - // "VGhlIHF1aWNrIGJyb3duIGZveA==" - // - // The encoding performed by `fast_encode` depend on these constants being correct. Performance can be tuned by - // multiplying these numbers by a different multiple (see `DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE` above). - const BASE16_UN_PADDED_MULTIPLE: usize = 1_usize; - const BASE2_UN_PADDED_MULTIPLE: usize = 1_usize; - const BASE32_UN_PADDED_MULTIPLE: usize = 5_usize; - const BASE64_UN_PADDED_MULTIPLE: usize = 3_usize; - const Z85_UN_PADDED_MULTIPLE: usize = 4_usize; - - // Similar to above, but for decoding - const BASE16_VALID_DECODING_MULTIPLE: usize = 2_usize; - const BASE2_VALID_DECODING_MULTIPLE: usize = 8_usize; - const BASE32_VALID_DECODING_MULTIPLE: usize = 8_usize; - const BASE64_VALID_DECODING_MULTIPLE: usize = 4_usize; - const Z85_VALID_DECODING_MULTIPLE: usize = 5_usize; + const BASE16_VALID_DECODING_MULTIPLE: usize = 2; + const BASE2_VALID_DECODING_MULTIPLE: usize = 8; + const BASE32_VALID_DECODING_MULTIPLE: usize = 8; + const BASE64_VALID_DECODING_MULTIPLE: usize = 4; + + const BASE16_UNPADDED_MULTIPLE: usize = 1; + const BASE2_UNPADDED_MULTIPLE: usize = 1; + const BASE32_UNPADDED_MULTIPLE: usize = 5; + const BASE64_UNPADDED_MULTIPLE: usize = 3; + + let supports_fast_decode_and_encode: Box = match format { + Format::Base16 => Box::from(EncodingWrapper::new( + HEXUPPER, + BASE16_VALID_DECODING_MULTIPLE, + BASE16_UNPADDED_MULTIPLE, + // spell-checker:disable-next-line + b"0123456789ABCDEF", + )), + Format::Base2Lsbf => Box::from(EncodingWrapper::new( + BASE2LSBF, + BASE2_VALID_DECODING_MULTIPLE, + BASE2_UNPADDED_MULTIPLE, + // spell-checker:disable-next-line + b"01", + )), + Format::Base2Msbf => Box::from(EncodingWrapper::new( + BASE2MSBF, + BASE2_VALID_DECODING_MULTIPLE, + BASE2_UNPADDED_MULTIPLE, + // spell-checker:disable-next-line + b"01", + )), + Format::Base32 => Box::from(EncodingWrapper::new( + BASE32, + BASE32_VALID_DECODING_MULTIPLE, + BASE32_UNPADDED_MULTIPLE, + // spell-checker:disable-next-line + b"ABCDEFGHIJKLMNOPQRSTUVWXYZ234567=", + )), + Format::Base32Hex => Box::from(EncodingWrapper::new( + BASE32HEX, + BASE32_VALID_DECODING_MULTIPLE, + BASE32_UNPADDED_MULTIPLE, + // spell-checker:disable-next-line + b"0123456789ABCDEFGHIJKLMNOPQRSTUV=", + )), + Format::Base64 => Box::from(EncodingWrapper::new( + BASE64, + BASE64_VALID_DECODING_MULTIPLE, + BASE64_UNPADDED_MULTIPLE, + // spell-checker:disable-next-line + b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789=+/", + )), + Format::Base64Url => Box::from(EncodingWrapper::new( + BASE64URL, + BASE64_VALID_DECODING_MULTIPLE, + BASE64_UNPADDED_MULTIPLE, + // spell-checker:disable-next-line + b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789=_-", + )), + Format::Z85 => Box::from(Z85Wrapper {}), + }; if decode { - let (encoding, valid_decoding_multiple, alphabet): (_, _, &[u8]) = match format { - // Use naive approach (now only semi-naive) for Z85, since the crate being used doesn't have the API - // needed - Format::Z85 => { - // spell-checker:disable-next-line - let alphabet = b"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.-:+=^!/*?&<>()[]{}@%$#"; - - fast_decode::fast_decode( - input, - ( - ZEightFiveWrapper {}, - Z85_VALID_DECODING_MULTIPLE * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE, - alphabet, - ), - ignore_garbage, - )?; - - return Ok(()); - } - - // For these, use faster, new decoding logic - Format::Base16 => ( - HEXUPPER, - BASE16_VALID_DECODING_MULTIPLE, - // spell-checker:disable-next-line - b"0123456789ABCDEF", - ), - Format::Base2Lsbf => ( - BASE2LSBF, - BASE2_VALID_DECODING_MULTIPLE, - // spell-checker:disable-next-line - b"01", - ), - Format::Base2Msbf => ( - BASE2MSBF, - BASE2_VALID_DECODING_MULTIPLE, - // spell-checker:disable-next-line - b"01", - ), - Format::Base32 => ( - BASE32, - BASE32_VALID_DECODING_MULTIPLE, - // spell-checker:disable-next-line - b"ABCDEFGHIJKLMNOPQRSTUVWXYZ234567=", - ), - Format::Base32Hex => ( - BASE32HEX, - BASE32_VALID_DECODING_MULTIPLE, - // spell-checker:disable-next-line - b"0123456789ABCDEFGHIJKLMNOPQRSTUV=", - ), - Format::Base64 => ( - BASE64, - BASE64_VALID_DECODING_MULTIPLE, - // spell-checker:disable-next-line - b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789=+/", - ), - Format::Base64Url => ( - BASE64URL, - BASE64_VALID_DECODING_MULTIPLE, - // spell-checker:disable-next-line - b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789=_-", - ), - }; - fast_decode::fast_decode( input, - ( - encoding, - valid_decoding_multiple * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE, - alphabet, - ), + supports_fast_decode_and_encode.as_ref(), ignore_garbage, )?; - - Ok(()) } else { - let (encoding, un_padded_multiple) = match format { - // Use naive approach for Z85 (now only semi-naive), since the crate being used doesn't have the API - // needed - Format::Z85 => { - fast_encode::fast_encode( - input, - ( - ZEightFiveWrapper {}, - Z85_UN_PADDED_MULTIPLE * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE, - ), - wrap, - )?; - - return Ok(()); - } - - // For these, use faster, new encoding logic - Format::Base16 => (HEXUPPER, BASE16_UN_PADDED_MULTIPLE), - Format::Base2Lsbf => (BASE2LSBF, BASE2_UN_PADDED_MULTIPLE), - Format::Base2Msbf => (BASE2MSBF, BASE2_UN_PADDED_MULTIPLE), - Format::Base32 => (BASE32, BASE32_UN_PADDED_MULTIPLE), - Format::Base32Hex => (BASE32HEX, BASE32_UN_PADDED_MULTIPLE), - Format::Base64 => (BASE64, BASE64_UN_PADDED_MULTIPLE), - Format::Base64Url => (BASE64URL, BASE64_UN_PADDED_MULTIPLE), - }; - - fast_encode::fast_encode( - input, - ( - encoding, - un_padded_multiple * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE, - ), - wrap, - )?; - - Ok(()) + fast_encode::fast_encode(input, supports_fast_decode_and_encode.as_ref(), wrap)?; } + + Ok(()) } mod fast_encode { - use crate::base_common::WRAP_DEFAULT; + use crate::base_common::{format_read_error, WRAP_DEFAULT}; use std::{ collections::VecDeque, io::{self, ErrorKind, Read, StdoutLock, Write}, num::NonZeroUsize, }; use uucore::{ - encoding::SupportsFastEncode, + encoding::SupportsFastDecodeAndEncode, error::{UResult, USimpleError}, }; @@ -319,6 +251,52 @@ mod fast_encode { print_buffer: Vec, } + // Start of helper functions + fn encode_in_chunks_to_buffer( + supports_fast_decode_and_encode: &dyn SupportsFastDecodeAndEncode, + encode_in_chunks_of_size: usize, + bytes_to_steal: usize, + read_buffer: &[u8], + encoded_buffer: &mut VecDeque, + leftover_buffer: &mut VecDeque, + ) -> UResult<()> { + let bytes_to_chunk = if bytes_to_steal > 0 { + let (stolen_bytes, rest_of_read_buffer) = read_buffer.split_at(bytes_to_steal); + + leftover_buffer.extend(stolen_bytes); + + // After appending the stolen bytes to `leftover_buffer`, it should be the right size + assert!(leftover_buffer.len() == encode_in_chunks_of_size); + + // Encode the old unencoded data and the stolen bytes, and add the result to + // `encoded_buffer` + supports_fast_decode_and_encode + .encode_to_vec_deque(leftover_buffer.make_contiguous(), encoded_buffer)?; + + // Reset `leftover_buffer` + leftover_buffer.clear(); + + rest_of_read_buffer + } else { + // Do not need to steal bytes from `read_buffer` + read_buffer + }; + + let chunks_exact = bytes_to_chunk.chunks_exact(encode_in_chunks_of_size); + + let remainder = chunks_exact.remainder(); + + for sl in chunks_exact { + assert!(sl.len() == encode_in_chunks_of_size); + + supports_fast_decode_and_encode.encode_to_vec_deque(sl, encoded_buffer)?; + } + + leftover_buffer.extend(remainder); + + Ok(()) + } + fn write_without_line_breaks( encoded_buffer: &mut VecDeque, stdout_lock: &mut StdoutLock, @@ -348,13 +326,13 @@ mod fast_encode { stdout_lock: &mut StdoutLock, is_cleanup: bool, ) -> io::Result<()> { - let line_length_usize = line_length.get(); + let line_length = line_length.get(); let make_contiguous_result = encoded_buffer.make_contiguous(); - let chunks_exact = make_contiguous_result.chunks_exact(line_length_usize); + let chunks_exact = make_contiguous_result.chunks_exact(line_length); - let mut bytes_added_to_print_buffer = 0_usize; + let mut bytes_added_to_print_buffer = 0; for sl in chunks_exact { bytes_added_to_print_buffer += sl.len(); @@ -400,25 +378,27 @@ mod fast_encode { } // End of helper functions - // TODO - // It turns out the crate being used already supports line wrapping: - // https://docs.rs/data-encoding/latest/data_encoding/struct.Specification.html#wrap-output-when-encoding-1 - // Check if that crate's line wrapping is faster than the wrapping being performed in this function - // Update: That crate does not support arbitrary width line wrapping. It only supports certain widths: - // https://github.com/ia0/data-encoding/blob/4f42ad7ef242f6d243e4de90cd1b46a57690d00e/lib/src/lib.rs#L1710 - // - /// `encoding` and `encode_in_chunks_of_size` are passed in a tuple to indicate that they are logically tied - pub fn fast_encode( + pub fn fast_encode( input: &mut R, - (supports_fast_encode, encode_in_chunks_of_size): (S, usize), + supports_fast_decode_and_encode: &dyn SupportsFastDecodeAndEncode, line_wrap: Option, ) -> UResult<()> { // Based on performance testing - const INPUT_BUFFER_SIZE: usize = 32_usize * 1_024_usize; + const INPUT_BUFFER_SIZE: usize = 32 * 1_024; + + const ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE: usize = 1_024; + + let encode_in_chunks_of_size = + supports_fast_decode_and_encode.unpadded_multiple() * ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE; + assert!(encode_in_chunks_of_size > 0); + + // "data-encoding" supports line wrapping, but not arbitrary line wrapping, only certain widths, so line + // wrapping must be implemented here. + // https://github.com/ia0/data-encoding/blob/4f42ad7ef242f6d243e4de90cd1b46a57690d00e/lib/src/lib.rs#L1710 let mut line_wrapping_option = match line_wrap { // Line wrapping is disabled because "-w"/"--wrap" was passed with "0" - Some(0_usize) => None, + Some(0) => None, // A custom line wrapping value was passed Some(an) => Some(LineWrapping { line_length: NonZeroUsize::new(an).unwrap(), @@ -433,7 +413,7 @@ mod fast_encode { // Start of buffers // Data that was read from stdin - let mut input_buffer = vec![0_u8; INPUT_BUFFER_SIZE]; + let mut input_buffer = vec![0; INPUT_BUFFER_SIZE]; assert!(!input_buffer.is_empty()); @@ -449,7 +429,7 @@ mod fast_encode { loop { match input.read(&mut input_buffer) { Ok(bytes_read_from_input) => { - if bytes_read_from_input == 0_usize { + if bytes_read_from_input == 0 { break; } @@ -467,44 +447,14 @@ mod fast_encode { } // Encode data in chunks, then place it in `encoded_buffer` - { - let bytes_to_chunk = if bytes_to_steal > 0_usize { - let (stolen_bytes, rest_of_read_buffer) = - read_buffer.split_at(bytes_to_steal); - - leftover_buffer.extend(stolen_bytes); - - // After appending the stolen bytes to `leftover_buffer`, it should be the right size - assert!(leftover_buffer.len() == encode_in_chunks_of_size); - - // Encode the old unencoded data and the stolen bytes, and add the result to - // `encoded_buffer` - supports_fast_encode.encode_to_vec_deque( - leftover_buffer.make_contiguous(), - &mut encoded_buffer, - )?; - - // Reset `leftover_buffer` - leftover_buffer.clear(); - - rest_of_read_buffer - } else { - // Do not need to steal bytes from `read_buffer` - read_buffer - }; - - let chunks_exact = bytes_to_chunk.chunks_exact(encode_in_chunks_of_size); - - let remainder = chunks_exact.remainder(); - - for sl in chunks_exact { - assert!(sl.len() == encode_in_chunks_of_size); - - supports_fast_encode.encode_to_vec_deque(sl, &mut encoded_buffer)?; - } - - leftover_buffer.extend(remainder); - } + encode_in_chunks_to_buffer( + supports_fast_decode_and_encode, + encode_in_chunks_of_size, + bytes_to_steal, + read_buffer, + &mut encoded_buffer, + &mut leftover_buffer, + )?; // Write all data in `encoded_buffer` to stdout write_to_stdout( @@ -515,12 +465,14 @@ mod fast_encode { )?; } Err(er) => { - if er.kind() == ErrorKind::Interrupted { + let kind = er.kind(); + + if kind == ErrorKind::Interrupted { // TODO // Retry reading? } - return Err(USimpleError::new(1_i32, format!("read error: {er}"))); + return Err(USimpleError::new(1, format_read_error(kind))); } } } @@ -529,7 +481,7 @@ mod fast_encode { // `input` has finished producing data, so the data remaining in the buffers needs to be encoded and printed { // Encode all remaining unencoded bytes, placing them in `encoded_buffer` - supports_fast_encode + supports_fast_decode_and_encode .encode_to_vec_deque(leftover_buffer.make_contiguous(), &mut encoded_buffer)?; // Write all data in `encoded_buffer` to stdout @@ -547,19 +499,20 @@ mod fast_encode { } mod fast_decode { + use crate::base_common::format_read_error; use std::io::{self, ErrorKind, Read, StdoutLock, Write}; use uucore::{ - encoding::SupportsFastDecode, + encoding::SupportsFastDecodeAndEncode, error::{UResult, USimpleError}, }; // Start of helper functions - pub fn alphabet_to_table(alphabet: &[u8], ignore_garbage: bool) -> [bool; 256_usize] { + fn alphabet_to_table(alphabet: &[u8], ignore_garbage: bool) -> [bool; 256] { // If "ignore_garbage" is enabled, all characters outside the alphabet are ignored // If it is not enabled, only '\n' and '\r' are ignored if ignore_garbage { // Note: "false" here - let mut table = [false; 256_usize]; + let mut table = [false; 256]; // Pass through no characters except those in the alphabet for ue in alphabet { @@ -574,7 +527,7 @@ mod fast_decode { table } else { // Note: "true" here - let mut table = [true; 256_usize]; + let mut table = [true; 256]; // Pass through all characters except '\n' and '\r' for ue in [b'\n', b'\r'] { @@ -590,6 +543,51 @@ mod fast_decode { } } + fn decode_in_chunks_to_buffer( + supports_fast_decode_and_encode: &dyn SupportsFastDecodeAndEncode, + decode_in_chunks_of_size: usize, + bytes_to_steal: usize, + read_buffer_filtered: &[u8], + decoded_buffer: &mut Vec, + leftover_buffer: &mut Vec, + ) -> UResult<()> { + let bytes_to_chunk = if bytes_to_steal > 0 { + let (stolen_bytes, rest_of_read_buffer_filtered) = + read_buffer_filtered.split_at(bytes_to_steal); + + leftover_buffer.extend(stolen_bytes); + + // After appending the stolen bytes to `leftover_buffer`, it should be the right size + assert!(leftover_buffer.len() == decode_in_chunks_of_size); + + // Decode the old un-decoded data and the stolen bytes, and add the result to + // `decoded_buffer` + supports_fast_decode_and_encode.decode_into_vec(leftover_buffer, decoded_buffer)?; + + // Reset `leftover_buffer` + leftover_buffer.clear(); + + rest_of_read_buffer_filtered + } else { + // Do not need to steal bytes from `read_buffer` + read_buffer_filtered + }; + + let chunks_exact = bytes_to_chunk.chunks_exact(decode_in_chunks_of_size); + + let remainder = chunks_exact.remainder(); + + for sl in chunks_exact { + assert!(sl.len() == decode_in_chunks_of_size); + + supports_fast_decode_and_encode.decode_into_vec(sl, decoded_buffer)?; + } + + leftover_buffer.extend(remainder); + + Ok(()) + } + fn write_to_stdout( decoded_buffer: &mut Vec, stdout_lock: &mut StdoutLock, @@ -605,13 +603,21 @@ mod fast_decode { /// `encoding`, `decode_in_chunks_of_size`, and `alphabet` are passed in a tuple to indicate that they are /// logically tied - pub fn fast_decode( + pub fn fast_decode( input: &mut R, - (supports_fast_decode, decode_in_chunks_of_size, alphabet): (S, usize, &[u8]), + supports_fast_decode_and_encode: &dyn SupportsFastDecodeAndEncode, ignore_garbage: bool, ) -> UResult<()> { // Based on performance testing - const INPUT_BUFFER_SIZE: usize = 32_usize * 1_024_usize; + const INPUT_BUFFER_SIZE: usize = 32 * 1_024; + + const DECODE_IN_CHUNKS_OF_SIZE_MULTIPLE: usize = 1_024; + + let alphabet = supports_fast_decode_and_encode.alphabet(); + let decode_in_chunks_of_size = supports_fast_decode_and_encode.valid_decoding_multiple() + * DECODE_IN_CHUNKS_OF_SIZE_MULTIPLE; + + assert!(decode_in_chunks_of_size > 0); // Note that it's not worth using "data-encoding"'s ignore functionality if "ignore_garbage" is true, because // "data-encoding"'s ignore functionality cannot discard non-ASCII bytes. The data has to be filtered before @@ -627,7 +633,7 @@ mod fast_decode { // Start of buffers // Data that was read from stdin - let mut input_buffer = vec![0_u8; INPUT_BUFFER_SIZE]; + let mut input_buffer = vec![0; INPUT_BUFFER_SIZE]; assert!(!input_buffer.is_empty()); @@ -647,7 +653,7 @@ mod fast_decode { loop { match input.read(&mut input_buffer) { Ok(bytes_read_from_input) => { - if bytes_read_from_input == 0_usize { + if bytes_read_from_input == 0 { break; } @@ -689,53 +695,27 @@ mod fast_decode { } // Decode data in chunks, then place it in `decoded_buffer` - { - let bytes_to_chunk = if bytes_to_steal > 0_usize { - let (stolen_bytes, rest_of_read_buffer_filtered) = - read_buffer_filtered.split_at(bytes_to_steal); - - leftover_buffer.extend(stolen_bytes); - - // After appending the stolen bytes to `leftover_buffer`, it should be the right size - assert!(leftover_buffer.len() == decode_in_chunks_of_size); - - // Decode the old un-decoded data and the stolen bytes, and add the result to - // `decoded_buffer` - supports_fast_decode - .decode_into_vec(&leftover_buffer, &mut decoded_buffer)?; - - // Reset `leftover_buffer` - leftover_buffer.clear(); - - rest_of_read_buffer_filtered - } else { - // Do not need to steal bytes from `read_buffer` - read_buffer_filtered - }; - - let chunks_exact = bytes_to_chunk.chunks_exact(decode_in_chunks_of_size); - - let remainder = chunks_exact.remainder(); - - for sl in chunks_exact { - assert!(sl.len() == decode_in_chunks_of_size); - - supports_fast_decode.decode_into_vec(sl, &mut decoded_buffer)?; - } - - leftover_buffer.extend(remainder); - } + decode_in_chunks_to_buffer( + supports_fast_decode_and_encode, + decode_in_chunks_of_size, + bytes_to_steal, + read_buffer_filtered, + &mut leftover_buffer, + &mut decoded_buffer, + )?; // Write all data in `decoded_buffer` to stdout write_to_stdout(&mut decoded_buffer, &mut stdout_lock)?; } Err(er) => { - if er.kind() == ErrorKind::Interrupted { + let kind = er.kind(); + + if kind == ErrorKind::Interrupted { // TODO // Retry reading? } - return Err(USimpleError::new(1_i32, format!("read error: {er}"))); + return Err(USimpleError::new(1, format_read_error(kind))); } } } @@ -744,7 +724,8 @@ mod fast_decode { // `input` has finished producing data, so the data remaining in the buffers needs to be decoded and printed { // Decode all remaining encoded bytes, placing them in `decoded_buffer` - supports_fast_decode.decode_into_vec(&leftover_buffer, &mut decoded_buffer)?; + supports_fast_decode_and_encode + .decode_into_vec(&leftover_buffer, &mut decoded_buffer)?; // Write all data in `decoded_buffer` to stdout write_to_stdout(&mut decoded_buffer, &mut stdout_lock)?; @@ -753,3 +734,21 @@ mod fast_decode { Ok(()) } } + +fn format_read_error(kind: ErrorKind) -> String { + let kind_string = kind.to_string(); + + // e.g. "is a directory" -> "Is a directory" + let kind_string_uncapitalized = kind_string + .char_indices() + .map(|(index, character)| { + if index == 0 { + character.to_uppercase().collect::() + } else { + character.to_string() + } + }) + .collect::(); + + format!("read error: {kind_string_uncapitalized}") +} diff --git a/src/uucore/src/lib/features/encoding.rs b/src/uucore/src/lib/features/encoding.rs index c4831e032ff..0d2ce622297 100644 --- a/src/uucore/src/lib/features/encoding.rs +++ b/src/uucore/src/lib/features/encoding.rs @@ -3,7 +3,7 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -// spell-checker:ignore (encodings) lsbf msbf +// spell-checker:ignore (encodings) lsbf msbf unpadded use crate::error::{UResult, USimpleError}; use data_encoding::Encoding; @@ -41,61 +41,80 @@ pub const BASE2MSBF: Encoding = new_encoding! { bit_order: MostSignificantFirst, }; -pub struct ZEightFiveWrapper {} +pub struct Z85Wrapper {} -pub trait SupportsFastEncode { - fn encode_to_vec_deque(&self, input: &[u8], output: &mut VecDeque) -> UResult<()>; +pub struct EncodingWrapper { + pub alphabet: &'static [u8], + pub encoding: Encoding, + pub unpadded_multiple: usize, + pub valid_decoding_multiple: usize, } -impl SupportsFastEncode for ZEightFiveWrapper { - fn encode_to_vec_deque(&self, input: &[u8], output: &mut VecDeque) -> UResult<()> { - // According to the spec we should not accept inputs whose len is not a multiple of 4. - // However, the z85 crate implements a padded encoding and accepts such inputs. We have to manually check for them. - if input.len() % 4_usize != 0_usize { - return Err(USimpleError::new( - 1_i32, - "error: invalid input (length must be multiple of 4 characters)".to_owned(), - )); +impl EncodingWrapper { + pub fn new( + encoding: Encoding, + valid_decoding_multiple: usize, + unpadded_multiple: usize, + alphabet: &'static [u8], + ) -> Self { + Self { + alphabet, + encoding, + unpadded_multiple, + valid_decoding_multiple, } - - let string = z85::encode(input); - - output.extend(string.as_bytes()); - - Ok(()) } } -impl SupportsFastEncode for Encoding { - // Adapted from `encode_append` in the "data-encoding" crate - fn encode_to_vec_deque(&self, input: &[u8], output: &mut VecDeque) -> UResult<()> { - let output_len = output.len(); +pub trait SupportsFastDecodeAndEncode { + /// Returns the list of characters used by this encoding + fn alphabet(&self) -> &'static [u8]; - output.resize(output_len + self.encode_len(input.len()), 0_u8); - - let make_contiguous_result = output.make_contiguous(); + fn decode_into_vec(&self, input: &[u8], output: &mut Vec) -> UResult<()>; - self.encode_mut(input, &mut (make_contiguous_result[output_len..])); + fn encode_to_vec_deque(&self, input: &[u8], output: &mut VecDeque) -> UResult<()>; - Ok(()) - } + /// Inputs with a length that is a multiple of this number do not have padding when encoded. For instance: + /// + /// "The quick brown" + /// + /// is 15 characters (divisible by 3), so it is encoded in Base64 without padding: + /// + /// "VGhlIHF1aWNrIGJyb3du" + /// + /// While: + /// + /// "The quick brown fox" + /// + /// is 19 characters, which is not divisible by 3, so its Base64 representation has padding: + /// + /// "VGhlIHF1aWNrIGJyb3duIGZveA==" + /// + /// The encoding performed by `fast_encode` depends on this number being correct. + fn unpadded_multiple(&self) -> usize; + + /// Data to decode must be a length that is multiple of this number + /// + /// The decoding performed by `fast_decode` depends on this number being correct. + fn valid_decoding_multiple(&self) -> usize; } -pub trait SupportsFastDecode { - fn decode_into_vec(&self, input: &[u8], output: &mut Vec) -> UResult<()>; -} +impl SupportsFastDecodeAndEncode for Z85Wrapper { + fn alphabet(&self) -> &'static [u8] { + // Z85 alphabet + b"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.-:+=^!/*?&<>()[]{}@%$#" + } -impl SupportsFastDecode for ZEightFiveWrapper { fn decode_into_vec(&self, input: &[u8], output: &mut Vec) -> UResult<()> { if input.first() == Some(&b'#') { - return Err(USimpleError::new(1_i32, "error: invalid input".to_owned())); + return Err(USimpleError::new(1, "error: invalid input".to_owned())); } // According to the spec we should not accept inputs whose len is not a multiple of 4. // However, the z85 crate implements a padded encoding and accepts such inputs. We have to manually check for them. - if input.len() % 4_usize != 0_usize { + if input.len() % 4 != 0 { return Err(USimpleError::new( - 1_i32, + 1, "error: invalid input (length must be multiple of 4 characters)".to_owned(), )); }; @@ -103,7 +122,7 @@ impl SupportsFastDecode for ZEightFiveWrapper { let decode_result = match z85::decode(input) { Ok(ve) => ve, Err(_de) => { - return Err(USimpleError::new(1_i32, "error: invalid input".to_owned())); + return Err(USimpleError::new(1, "error: invalid input".to_owned())); } }; @@ -111,23 +130,52 @@ impl SupportsFastDecode for ZEightFiveWrapper { Ok(()) } + + fn valid_decoding_multiple(&self) -> usize { + 5 + } + + fn encode_to_vec_deque(&self, input: &[u8], output: &mut VecDeque) -> UResult<()> { + // According to the spec we should not accept inputs whose len is not a multiple of 4. + // However, the z85 crate implements a padded encoding and accepts such inputs. We have to manually check for them. + if input.len() % 4 != 0 { + return Err(USimpleError::new( + 1, + "error: invalid input (length must be multiple of 4 characters)".to_owned(), + )); + } + + let string = z85::encode(input); + + output.extend(string.as_bytes()); + + Ok(()) + } + + fn unpadded_multiple(&self) -> usize { + 4 + } } -impl SupportsFastDecode for Encoding { +impl SupportsFastDecodeAndEncode for EncodingWrapper { + fn alphabet(&self) -> &'static [u8] { + self.alphabet + } + // Adapted from `decode` in the "data-encoding" crate fn decode_into_vec(&self, input: &[u8], output: &mut Vec) -> UResult<()> { - let decode_len_result = match self.decode_len(input.len()) { + let decode_len_result = match self.encoding.decode_len(input.len()) { Ok(us) => us, Err(_de) => { - return Err(USimpleError::new(1_i32, "error: invalid input".to_owned())); + return Err(USimpleError::new(1, "error: invalid input".to_owned())); } }; let output_len = output.len(); - output.resize(output_len + decode_len_result, 0_u8); + output.resize(output_len + decode_len_result, 0); - match self.decode_mut(input, &mut (output[output_len..])) { + match self.encoding.decode_mut(input, &mut (output[output_len..])) { Ok(us) => { // See: // https://docs.rs/data-encoding/latest/data_encoding/struct.Encoding.html#method.decode_mut @@ -135,10 +183,32 @@ impl SupportsFastDecode for Encoding { output.truncate(output_len + us); } Err(_de) => { - return Err(USimpleError::new(1_i32, "error: invalid input".to_owned())); + return Err(USimpleError::new(1, "error: invalid input".to_owned())); } } Ok(()) } + + fn valid_decoding_multiple(&self) -> usize { + self.valid_decoding_multiple + } + + // Adapted from `encode_append` in the "data-encoding" crate + fn encode_to_vec_deque(&self, input: &[u8], output: &mut VecDeque) -> UResult<()> { + let output_len = output.len(); + + output.resize(output_len + self.encoding.encode_len(input.len()), 0); + + let make_contiguous_result = output.make_contiguous(); + + self.encoding + .encode_mut(input, &mut (make_contiguous_result[output_len..])); + + Ok(()) + } + + fn unpadded_multiple(&self) -> usize { + self.unpadded_multiple + } } diff --git a/tests/by-util/test_basenc.rs b/tests/by-util/test_basenc.rs index 8701911f7fa..ce7e864a301 100644 --- a/tests/by-util/test_basenc.rs +++ b/tests/by-util/test_basenc.rs @@ -31,9 +31,7 @@ fn test_invalid_input() { let error_message = if cfg!(windows) { "basenc: .: Permission denied\n" } else { - // TODO - // Other implementations do not show " (os error 21)" - "basenc: read error: Is a directory (os error 21)\n" + "basenc: read error: Is a directory\n" }; new_ucmd!() .args(&["--base32", "."]) From ea7a543bfe66bd10d3aac9d654ca40a811ab87ce Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Tue, 24 Sep 2024 19:25:29 -0500 Subject: [PATCH 12/17] Fix two bugs. Add property testing. --- Cargo.lock | 103 ++++-- src/uu/base32/Cargo.toml | 3 + src/uu/base32/src/base_common.rs | 147 ++++---- src/uu/base32/tests/property_tests.rs | 430 ++++++++++++++++++++++++ src/uu/factor/Cargo.toml | 3 - src/uucore/src/lib/features/encoding.rs | 17 +- tests/by-util/test_basenc.rs | 31 +- 7 files changed, 607 insertions(+), 127 deletions(-) create mode 100644 src/uu/base32/tests/property_tests.rs diff --git a/Cargo.lock b/Cargo.lock index 5fca20a7118..9514fafc162 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -185,6 +185,21 @@ dependencies = [ "syn 2.0.79", ] +[[package]] +name = "bit-set" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" + [[package]] name = "bitflags" version = "1.3.2" @@ -850,16 +865,6 @@ version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" -[[package]] -name = "env_logger" -version = "0.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a19187fea3ac7e84da7dacf48de0c45d63c6a76f9490dae389aead16c243fce3" -dependencies = [ - "log", - "regex", -] - [[package]] name = "equivalent" version = "1.0.1" @@ -1548,6 +1553,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", + "libm", ] [[package]] @@ -1791,22 +1797,37 @@ dependencies = [ "hex", ] +[[package]] +name = "proptest" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4c2511913b88df1637da85cc8d96ec8e43a3f8bb8ccb71ee1ac240d6f3df58d" +dependencies = [ + "bit-set", + "bit-vec", + "bitflags 2.6.0", + "lazy_static", + "num-traits", + "rand", + "rand_chacha", + "rand_xorshift", + "regex-syntax", + "rusty-fork", + "tempfile", + "unarray", +] + [[package]] name = "quick-error" -version = "2.0.1" +version = "1.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" [[package]] -name = "quickcheck" -version = "1.0.3" +name = "quick-error" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "588f6378e4dd99458b60ec275b4477add41ce4fa9f64dcba6f15adccb19b50d6" -dependencies = [ - "env_logger", - "log", - "rand", -] +checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" [[package]] name = "quote" @@ -1862,6 +1883,15 @@ dependencies = [ "rand_core", ] +[[package]] +name = "rand_xorshift" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d25bf25ec5ae4a3f1b92f929810509a2f53d7dca2f50b794ff57e3face536c8f" +dependencies = [ + "rand_core", +] + [[package]] name = "rayon" version = "1.10.0" @@ -2030,6 +2060,18 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "rusty-fork" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb3dcc6e454c328bb824492db107ab7c0ae8fcffe4ad210136ef014458c1bc4f" +dependencies = [ + "fnv", + "quick-error 1.2.3", + "tempfile", + "wait-timeout", +] + [[package]] name = "same-file" version = "1.0.6" @@ -2396,6 +2438,12 @@ version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987" +[[package]] +name = "unarray" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" + [[package]] name = "unicode-ident" version = "1.0.13" @@ -2476,6 +2524,7 @@ name = "uu_base32" version = "0.0.27" dependencies = [ "clap", + "proptest", "uucore", ] @@ -2586,7 +2635,7 @@ dependencies = [ "filetime", "indicatif", "libc", - "quick-error", + "quick-error 2.0.1", "selinux", "uucore", "walkdir", @@ -2730,7 +2779,6 @@ dependencies = [ "num-bigint", "num-prime", "num-traits", - "quickcheck", "rand", "smallvec", "uucore", @@ -3035,7 +3083,7 @@ dependencies = [ "chrono", "clap", "itertools", - "quick-error", + "quick-error 2.0.1", "regex", "uucore", ] @@ -3533,6 +3581,15 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +[[package]] +name = "wait-timeout" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f200f5b12eb75f8c1ed65abd4b2db8a6e1b138a20de009dacee265a2498f3f6" +dependencies = [ + "libc", +] + [[package]] name = "walkdir" version = "2.5.0" diff --git a/src/uu/base32/Cargo.toml b/src/uu/base32/Cargo.toml index 152091aa3a6..88724360f7e 100644 --- a/src/uu/base32/Cargo.toml +++ b/src/uu/base32/Cargo.toml @@ -20,6 +20,9 @@ path = "src/base32.rs" clap = { workspace = true } uucore = { workspace = true, features = ["encoding"] } +[dev-dependencies] +proptest = "1.5.0" + [[bin]] name = "base32" path = "src/main.rs" diff --git a/src/uu/base32/src/base_common.rs b/src/uu/base32/src/base_common.rs index 5bee873dad7..9a56d131656 100644 --- a/src/uu/base32/src/base_common.rs +++ b/src/uu/base32/src/base_common.rs @@ -7,7 +7,7 @@ use clap::{crate_version, Arg, ArgAction, Command}; use std::fs::File; -use std::io::{ErrorKind, Read, Stdin}; +use std::io::{self, ErrorKind, Read, Stdin}; use std::path::Path; use uucore::display::Quotable; use uucore::encoding::{ @@ -25,7 +25,7 @@ pub const BASE_CMD_PARSE_ERROR: i32 = 1; /// Other implementations default to 76 /// /// This default is only used if no "-w"/"--wrap" argument is passed -const WRAP_DEFAULT: usize = 76; +pub const WRAP_DEFAULT: usize = 76; pub struct Config { pub decode: bool, @@ -158,6 +158,28 @@ pub fn handle_input( ignore_garbage: bool, decode: bool, ) -> UResult<()> { + let supports_fast_decode_and_encode = get_supports_fast_decode_and_encode(format); + + let mut stdout_lock = io::stdout().lock(); + + if decode { + fast_decode::fast_decode( + input, + &mut stdout_lock, + supports_fast_decode_and_encode.as_ref(), + ignore_garbage, + ) + } else { + fast_encode::fast_encode( + input, + &mut stdout_lock, + supports_fast_decode_and_encode.as_ref(), + wrap, + ) + } +} + +pub fn get_supports_fast_decode_and_encode(format: Format) -> Box { const BASE16_VALID_DECODING_MULTIPLE: usize = 2; const BASE2_VALID_DECODING_MULTIPLE: usize = 8; const BASE32_VALID_DECODING_MULTIPLE: usize = 8; @@ -168,7 +190,7 @@ pub fn handle_input( const BASE32_UNPADDED_MULTIPLE: usize = 5; const BASE64_UNPADDED_MULTIPLE: usize = 3; - let supports_fast_decode_and_encode: Box = match format { + match format { Format::Base16 => Box::from(EncodingWrapper::new( HEXUPPER, BASE16_VALID_DECODING_MULTIPLE, @@ -219,26 +241,14 @@ pub fn handle_input( b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789=_-", )), Format::Z85 => Box::from(Z85Wrapper {}), - }; - - if decode { - fast_decode::fast_decode( - input, - supports_fast_decode_and_encode.as_ref(), - ignore_garbage, - )?; - } else { - fast_encode::fast_encode(input, supports_fast_decode_and_encode.as_ref(), wrap)?; } - - Ok(()) } -mod fast_encode { +pub mod fast_encode { use crate::base_common::{format_read_error, WRAP_DEFAULT}; use std::{ collections::VecDeque, - io::{self, ErrorKind, Read, StdoutLock, Write}, + io::{self, ErrorKind, Read, Write}, num::NonZeroUsize, }; use uucore::{ @@ -299,17 +309,17 @@ mod fast_encode { fn write_without_line_breaks( encoded_buffer: &mut VecDeque, - stdout_lock: &mut StdoutLock, + output: &mut dyn Write, is_cleanup: bool, ) -> io::Result<()> { // TODO // `encoded_buffer` only has to be a VecDeque if line wrapping is enabled // (`make_contiguous` should be a no-op here) // Refactoring could avoid this call - stdout_lock.write_all(encoded_buffer.make_contiguous())?; + output.write_all(encoded_buffer.make_contiguous())?; if is_cleanup { - stdout_lock.write_all(b"\n")?; + output.write_all(b"\n")?; } else { encoded_buffer.clear(); } @@ -323,7 +333,7 @@ mod fast_encode { ref mut print_buffer, }: &mut LineWrapping, encoded_buffer: &mut VecDeque, - stdout_lock: &mut StdoutLock, + output: &mut dyn Write, is_cleanup: bool, ) -> io::Result<()> { let line_length = line_length.get(); @@ -341,7 +351,7 @@ mod fast_encode { print_buffer.push(b'\n'); } - stdout_lock.write_all(print_buffer)?; + output.write_all(print_buffer)?; // Remove the bytes that were just printed from `encoded_buffer` drop(encoded_buffer.drain(..bytes_added_to_print_buffer)); @@ -351,8 +361,8 @@ mod fast_encode { // Do not write a newline in this case, because two trailing newlines should never be printed } else { // Print the partial line, since this is cleanup and no more data is coming - stdout_lock.write_all(encoded_buffer.make_contiguous())?; - stdout_lock.write_all(b"\n")?; + output.write_all(encoded_buffer.make_contiguous())?; + output.write_all(b"\n")?; } } else { print_buffer.clear(); @@ -361,27 +371,28 @@ mod fast_encode { Ok(()) } - fn write_to_stdout( + fn write_to_output( line_wrapping_option: &mut Option, encoded_buffer: &mut VecDeque, - stdout_lock: &mut StdoutLock, + output: &mut dyn Write, is_cleanup: bool, ) -> io::Result<()> { - // Write all data in `encoded_buffer` to stdout + // Write all data in `encoded_buffer` to `output` if let &mut Some(ref mut li) = line_wrapping_option { - write_with_line_breaks(li, encoded_buffer, stdout_lock, is_cleanup)?; + write_with_line_breaks(li, encoded_buffer, output, is_cleanup)?; } else { - write_without_line_breaks(encoded_buffer, stdout_lock, is_cleanup)?; + write_without_line_breaks(encoded_buffer, output, is_cleanup)?; } Ok(()) } // End of helper functions - pub fn fast_encode( + pub fn fast_encode( input: &mut R, + mut output: W, supports_fast_decode_and_encode: &dyn SupportsFastDecodeAndEncode, - line_wrap: Option, + wrap: Option, ) -> UResult<()> { // Based on performance testing const INPUT_BUFFER_SIZE: usize = 32 * 1_024; @@ -393,10 +404,10 @@ mod fast_encode { assert!(encode_in_chunks_of_size > 0); - // "data-encoding" supports line wrapping, but not arbitrary line wrapping, only certain widths, so line - // wrapping must be implemented here. + // The "data-encoding" crate supports line wrapping, but not arbitrary line wrapping, only certain widths, so + // line wrapping must be handled here. // https://github.com/ia0/data-encoding/blob/4f42ad7ef242f6d243e4de90cd1b46a57690d00e/lib/src/lib.rs#L1710 - let mut line_wrapping_option = match line_wrap { + let mut line_wrapping = match wrap { // Line wrapping is disabled because "-w"/"--wrap" was passed with "0" Some(0) => None, // A custom line wrapping value was passed @@ -420,12 +431,10 @@ mod fast_encode { // Data that was read from stdin but has not been encoded yet let mut leftover_buffer = VecDeque::::new(); - // Encoded data that needs to be written to stdout + // Encoded data that needs to be written to output let mut encoded_buffer = VecDeque::::new(); // End of buffers - let mut stdout_lock = io::stdout().lock(); - loop { match input.read(&mut input_buffer) { Ok(bytes_read_from_input) => { @@ -443,6 +452,8 @@ mod fast_encode { // Do not have enough data to encode a chunk, so copy data to `leftover_buffer` and read more leftover_buffer.extend(read_buffer); + assert!(leftover_buffer.len() < encode_in_chunks_of_size); + continue; } @@ -456,13 +467,10 @@ mod fast_encode { &mut leftover_buffer, )?; - // Write all data in `encoded_buffer` to stdout - write_to_stdout( - &mut line_wrapping_option, - &mut encoded_buffer, - &mut stdout_lock, - false, - )?; + assert!(leftover_buffer.len() < encode_in_chunks_of_size); + + // Write all data in `encoded_buffer` to output + write_to_output(&mut line_wrapping, &mut encoded_buffer, &mut output, false)?; } Err(er) => { let kind = er.kind(); @@ -484,23 +492,18 @@ mod fast_encode { supports_fast_decode_and_encode .encode_to_vec_deque(leftover_buffer.make_contiguous(), &mut encoded_buffer)?; - // Write all data in `encoded_buffer` to stdout + // Write all data in `encoded_buffer` to output // `is_cleanup` triggers special cleanup-only logic - write_to_stdout( - &mut line_wrapping_option, - &mut encoded_buffer, - &mut stdout_lock, - true, - )?; + write_to_output(&mut line_wrapping, &mut encoded_buffer, &mut output, true)?; } Ok(()) } } -mod fast_decode { +pub mod fast_decode { use crate::base_common::format_read_error; - use std::io::{self, ErrorKind, Read, StdoutLock, Write}; + use std::io::{self, ErrorKind, Read, Write}; use uucore::{ encoding::SupportsFastDecodeAndEncode, error::{UResult, USimpleError}, @@ -588,12 +591,9 @@ mod fast_decode { Ok(()) } - fn write_to_stdout( - decoded_buffer: &mut Vec, - stdout_lock: &mut StdoutLock, - ) -> io::Result<()> { - // Write all data in `decoded_buffer` to stdout - stdout_lock.write_all(decoded_buffer.as_slice())?; + fn write_to_output(decoded_buffer: &mut Vec, output: &mut dyn Write) -> io::Result<()> { + // Write all data in `decoded_buffer` to `output` + output.write_all(decoded_buffer.as_slice())?; decoded_buffer.clear(); @@ -601,10 +601,9 @@ mod fast_decode { } // End of helper functions - /// `encoding`, `decode_in_chunks_of_size`, and `alphabet` are passed in a tuple to indicate that they are - /// logically tied - pub fn fast_decode( + pub fn fast_decode( input: &mut R, + mut output: &mut W, supports_fast_decode_and_encode: &dyn SupportsFastDecodeAndEncode, ignore_garbage: bool, ) -> UResult<()> { @@ -640,7 +639,7 @@ mod fast_decode { // Data that was read from stdin but has not been decoded yet let mut leftover_buffer = Vec::::new(); - // Decoded data that needs to be written to stdout + // Decoded data that needs to be written to `output` let mut decoded_buffer = Vec::::new(); // Buffer that will be used when "ignore_garbage" is true, and the chunk read from "input" contains garbage @@ -648,8 +647,6 @@ mod fast_decode { let mut non_garbage_buffer = Vec::::new(); // End of buffers - let mut stdout_lock = io::stdout().lock(); - loop { match input.read(&mut input_buffer) { Ok(bytes_read_from_input) => { @@ -687,10 +684,12 @@ mod fast_decode { // How many bytes to steal from `read_buffer` to get `leftover_buffer` to the right size let bytes_to_steal = decode_in_chunks_of_size - leftover_buffer.len(); - if bytes_to_steal > bytes_read_from_input { + if bytes_to_steal > read_buffer_filtered.len() { // Do not have enough data to decode a chunk, so copy data to `leftover_buffer` and read more leftover_buffer.extend(read_buffer_filtered); + assert!(leftover_buffer.len() < decode_in_chunks_of_size); + continue; } @@ -700,12 +699,14 @@ mod fast_decode { decode_in_chunks_of_size, bytes_to_steal, read_buffer_filtered, - &mut leftover_buffer, &mut decoded_buffer, + &mut leftover_buffer, )?; - // Write all data in `decoded_buffer` to stdout - write_to_stdout(&mut decoded_buffer, &mut stdout_lock)?; + assert!(leftover_buffer.len() < decode_in_chunks_of_size); + + // Write all data in `decoded_buffer` to `output` + write_to_output(&mut decoded_buffer, &mut output)?; } Err(er) => { let kind = er.kind(); @@ -727,8 +728,8 @@ mod fast_decode { supports_fast_decode_and_encode .decode_into_vec(&leftover_buffer, &mut decoded_buffer)?; - // Write all data in `decoded_buffer` to stdout - write_to_stdout(&mut decoded_buffer, &mut stdout_lock)?; + // Write all data in `decoded_buffer` to `output` + write_to_output(&mut decoded_buffer, &mut output)?; } Ok(()) @@ -739,7 +740,7 @@ fn format_read_error(kind: ErrorKind) -> String { let kind_string = kind.to_string(); // e.g. "is a directory" -> "Is a directory" - let kind_string_uncapitalized = kind_string + let kind_string_capitalized = kind_string .char_indices() .map(|(index, character)| { if index == 0 { @@ -750,5 +751,5 @@ fn format_read_error(kind: ErrorKind) -> String { }) .collect::(); - format!("read error: {kind_string_uncapitalized}") + format!("read error: {kind_string_capitalized}") } diff --git a/src/uu/base32/tests/property_tests.rs b/src/uu/base32/tests/property_tests.rs new file mode 100644 index 00000000000..0f2393c42ab --- /dev/null +++ b/src/uu/base32/tests/property_tests.rs @@ -0,0 +1,430 @@ +// spell-checker:ignore lsbf msbf proptest + +use proptest::{prelude::TestCaseError, prop_assert, prop_assert_eq, test_runner::TestRunner}; +use std::io::Cursor; +use uu_base32::base_common::{fast_decode, fast_encode, get_supports_fast_decode_and_encode}; +use uucore::encoding::{Format, SupportsFastDecodeAndEncode}; + +const CASES: u32 = { + #[cfg(debug_assertions)] + { + 32 + } + + #[cfg(not(debug_assertions))] + { + 128 + } +}; + +const NORMAL_INPUT_SIZE_LIMIT: usize = { + #[cfg(debug_assertions)] + { + // 256 kibibytes + 256 * 1024 + } + + #[cfg(not(debug_assertions))] + { + // 4 mebibytes + 4 * 1024 * 1024 + } +}; + +const LARGE_INPUT_SIZE_LIMIT: usize = 4 * NORMAL_INPUT_SIZE_LIMIT; + +// Note that `TestRunner`s cannot be reused +fn get_test_runner() -> TestRunner { + TestRunner::new(proptest::test_runner::Config { + cases: CASES, + failure_persistence: None, + + ..proptest::test_runner::Config::default() + }) +} + +fn generic_round_trip(format: Format) { + let supports_fast_decode_and_encode = get_supports_fast_decode_and_encode(format); + + let supports_fast_decode_and_encode_ref = supports_fast_decode_and_encode.as_ref(); + + // Make sure empty inputs round trip + { + get_test_runner() + .run( + &( + proptest::bool::ANY, + proptest::bool::ANY, + proptest::option::of(0_usize..512_usize), + ), + |(ignore_garbage, line_wrap_zero, line_wrap)| { + configurable_round_trip( + format, + supports_fast_decode_and_encode_ref, + ignore_garbage, + line_wrap_zero, + line_wrap, + // Do not add garbage + Vec::<(usize, u8)>::new(), + // Empty input + Vec::::new(), + ) + }, + ) + .unwrap(); + } + + // Unusually large line wrapping settings + { + get_test_runner() + .run( + &( + proptest::bool::ANY, + proptest::bool::ANY, + proptest::option::of(512_usize..65_535_usize), + proptest::collection::vec(proptest::num::u8::ANY, 0..NORMAL_INPUT_SIZE_LIMIT), + ), + |(ignore_garbage, line_wrap_zero, line_wrap, input)| { + configurable_round_trip( + format, + supports_fast_decode_and_encode_ref, + ignore_garbage, + line_wrap_zero, + line_wrap, + // Do not add garbage + Vec::<(usize, u8)>::new(), + input, + ) + }, + ) + .unwrap(); + } + + // Spend more time on sane line wrapping settings + { + get_test_runner() + .run( + &( + proptest::bool::ANY, + proptest::bool::ANY, + proptest::option::of(0_usize..512_usize), + proptest::collection::vec(proptest::num::u8::ANY, 0..NORMAL_INPUT_SIZE_LIMIT), + ), + |(ignore_garbage, line_wrap_zero, line_wrap, input)| { + configurable_round_trip( + format, + supports_fast_decode_and_encode_ref, + ignore_garbage, + line_wrap_zero, + line_wrap, + // Do not add garbage + Vec::<(usize, u8)>::new(), + input, + ) + }, + ) + .unwrap(); + } + + // Test with garbage data + { + get_test_runner() + .run( + &( + proptest::bool::ANY, + proptest::bool::ANY, + proptest::option::of(0_usize..512_usize), + // Garbage data to insert + proptest::collection::vec( + ( + // Random index + proptest::num::usize::ANY, + // In all of the encodings being tested, non-ASCII bytes are garbage + 128_u8..=u8::MAX, + ), + 0..4_096, + ), + proptest::collection::vec(proptest::num::u8::ANY, 0..NORMAL_INPUT_SIZE_LIMIT), + ), + |(ignore_garbage, line_wrap_zero, line_wrap, garbage_data, input)| { + configurable_round_trip( + format, + supports_fast_decode_and_encode_ref, + ignore_garbage, + line_wrap_zero, + line_wrap, + garbage_data, + input, + ) + }, + ) + .unwrap(); + } + + // Test small inputs + { + get_test_runner() + .run( + &( + proptest::bool::ANY, + proptest::bool::ANY, + proptest::option::of(0_usize..512_usize), + proptest::collection::vec(proptest::num::u8::ANY, 0..1_024), + ), + |(ignore_garbage, line_wrap_zero, line_wrap, input)| { + configurable_round_trip( + format, + supports_fast_decode_and_encode_ref, + ignore_garbage, + line_wrap_zero, + line_wrap, + // Do not add garbage + Vec::<(usize, u8)>::new(), + input, + ) + }, + ) + .unwrap(); + } + + // Test small inputs with garbage data + { + get_test_runner() + .run( + &( + proptest::bool::ANY, + proptest::bool::ANY, + proptest::option::of(0_usize..512_usize), + // Garbage data to insert + proptest::collection::vec( + ( + // Random index + proptest::num::usize::ANY, + // In all of the encodings being tested, non-ASCII bytes are garbage + 128_u8..=u8::MAX, + ), + 0..1_024, + ), + proptest::collection::vec(proptest::num::u8::ANY, 0..1_024), + ), + |(ignore_garbage, line_wrap_zero, line_wrap, garbage_data, input)| { + configurable_round_trip( + format, + supports_fast_decode_and_encode_ref, + ignore_garbage, + line_wrap_zero, + line_wrap, + garbage_data, + input, + ) + }, + ) + .unwrap(); + } + + // Test large inputs + { + get_test_runner() + .run( + &( + proptest::bool::ANY, + proptest::bool::ANY, + proptest::option::of(0_usize..512_usize), + proptest::collection::vec(proptest::num::u8::ANY, 0..LARGE_INPUT_SIZE_LIMIT), + ), + |(ignore_garbage, line_wrap_zero, line_wrap, input)| { + configurable_round_trip( + format, + supports_fast_decode_and_encode_ref, + ignore_garbage, + line_wrap_zero, + line_wrap, + // Do not add garbage + Vec::<(usize, u8)>::new(), + input, + ) + }, + ) + .unwrap(); + } +} + +fn configurable_round_trip( + format: Format, + supports_fast_decode_and_encode: &dyn SupportsFastDecodeAndEncode, + ignore_garbage: bool, + line_wrap_zero: bool, + line_wrap: Option, + garbage_data: Vec<(usize, u8)>, + mut input: Vec, +) -> Result<(), TestCaseError> { + // Z85 only accepts inputs with lengths divisible by 4 + if let Format::Z85 = format { + // Reduce length of "input" until it is divisible by 4 + input.truncate((input.len() / 4) * 4); + + assert!((input.len() % 4) == 0); + } + + let line_wrap_to_use = if line_wrap_zero { Some(0) } else { line_wrap }; + + let input_len = input.len(); + + let garbage_data_len = garbage_data.len(); + + let garbage_data_is_empty = garbage_data_len == 0; + + let (input, encoded) = { + let mut output = Vec::with_capacity(input_len * 8); + + let mut cursor = Cursor::new(input); + + fast_encode::fast_encode( + &mut cursor, + &mut output, + supports_fast_decode_and_encode, + line_wrap_to_use, + ) + .unwrap(); + + (cursor.into_inner(), output) + }; + + let encoded_or_encoded_with_garbage = if garbage_data_is_empty { + encoded + } else { + let encoded_len = encoded.len(); + + let encoded_highest_index = match encoded_len.checked_sub(1) { + Some(0) | None => None, + Some(x) => Some(x), + }; + + let mut garbage_data_indexed = vec![Option::::None; encoded_len]; + + let mut encoded_with_garbage = Vec::::with_capacity(encoded_len + garbage_data_len); + + for (index, garbage_byte) in garbage_data { + if let Some(x) = encoded_highest_index { + let index_to_use = index % x; + + garbage_data_indexed[index_to_use] = Some(garbage_byte); + } else { + encoded_with_garbage.push(garbage_byte); + } + } + + for (index, encoded_byte) in encoded.into_iter().enumerate() { + encoded_with_garbage.push(encoded_byte); + + if let Some(garbage_byte) = garbage_data_indexed[index] { + encoded_with_garbage.push(garbage_byte); + } + } + + encoded_with_garbage + }; + + match line_wrap_to_use { + Some(0) => { + let line_endings_count = encoded_or_encoded_with_garbage + .iter() + .filter(|byte| **byte == b'\n') + .count(); + + // If line wrapping is disabled, there should only be one '\n' character (at the very end of the output) + prop_assert_eq!(line_endings_count, 1); + } + _ => { + // TODO + // Validate other line wrapping settings + } + } + + let decoded_or_error = { + let mut output = Vec::with_capacity(input_len); + + let mut cursor = Cursor::new(encoded_or_encoded_with_garbage); + + match fast_decode::fast_decode( + &mut cursor, + &mut output, + supports_fast_decode_and_encode, + ignore_garbage, + ) { + Ok(()) => Ok(output), + Err(er) => Err(er), + } + }; + + let made_round_trip = match decoded_or_error { + Ok(ve) => input.as_slice() == ve.as_slice(), + Err(_) => false, + }; + + let result_was_correct = if garbage_data_is_empty || ignore_garbage { + // If there was no garbage data added, or if "ignore_garbage" was enabled, expect the round trip to succeed + made_round_trip + } else { + // If garbage data was added, and "ignore_garbage" was disabled, expect the round trip to fail + + !made_round_trip + }; + + if !result_was_correct { + eprintln!( + "\ +(configurable_round_trip) FAILURE +format: {format:?} +ignore_garbage: {ignore_garbage} +line_wrap_to_use: {line_wrap_to_use:?} +garbage_data_len: {garbage_data_len} +input_len: {input_len} +", + ); + } + + prop_assert!(result_was_correct); + + Ok(()) +} + +#[test] +fn base16_round_trip() { + generic_round_trip(Format::Base16); +} + +#[test] +fn base2lsbf_round_trip() { + generic_round_trip(Format::Base2Lsbf); +} + +#[test] +fn base2msbf_round_trip() { + generic_round_trip(Format::Base2Msbf); +} + +#[test] +fn base32_round_trip() { + generic_round_trip(Format::Base32); +} + +#[test] +fn base32hex_round_trip() { + generic_round_trip(Format::Base32Hex); +} + +#[test] +fn base64_round_trip() { + generic_round_trip(Format::Base64); +} + +#[test] +fn base64url_round_trip() { + generic_round_trip(Format::Base64Url); +} + +#[test] +fn z85_round_trip() { + generic_round_trip(Format::Z85); +} diff --git a/src/uu/factor/Cargo.toml b/src/uu/factor/Cargo.toml index 49e836befa3..e28db8e6377 100644 --- a/src/uu/factor/Cargo.toml +++ b/src/uu/factor/Cargo.toml @@ -26,9 +26,6 @@ uucore = { workspace = true } num-bigint = { workspace = true } num-prime = { workspace = true } -[dev-dependencies] -quickcheck = "1.0.3" - [[bin]] name = "factor" path = "src/main.rs" diff --git a/src/uucore/src/lib/features/encoding.rs b/src/uucore/src/lib/features/encoding.rs index 0d2ce622297..b213da00761 100644 --- a/src/uucore/src/lib/features/encoding.rs +++ b/src/uucore/src/lib/features/encoding.rs @@ -19,7 +19,7 @@ pub mod for_cksum { pub use data_encoding::BASE64; } -#[derive(Clone, Copy)] +#[derive(Clone, Copy, Debug)] pub enum Format { Base64, Base64Url, @@ -57,6 +57,12 @@ impl EncodingWrapper { unpadded_multiple: usize, alphabet: &'static [u8], ) -> Self { + assert!(valid_decoding_multiple > 0); + + assert!(unpadded_multiple > 0); + + assert!(!alphabet.is_empty()); + Self { alphabet, encoding, @@ -110,15 +116,6 @@ impl SupportsFastDecodeAndEncode for Z85Wrapper { return Err(USimpleError::new(1, "error: invalid input".to_owned())); } - // According to the spec we should not accept inputs whose len is not a multiple of 4. - // However, the z85 crate implements a padded encoding and accepts such inputs. We have to manually check for them. - if input.len() % 4 != 0 { - return Err(USimpleError::new( - 1, - "error: invalid input (length must be multiple of 4 characters)".to_owned(), - )); - }; - let decode_result = match z85::decode(input) { Ok(ve) => ve, Err(_de) => { diff --git a/tests/by-util/test_basenc.rs b/tests/by-util/test_basenc.rs index ce7e864a301..437413a26b7 100644 --- a/tests/by-util/test_basenc.rs +++ b/tests/by-util/test_basenc.rs @@ -45,7 +45,6 @@ fn test_base64() { .arg("--base64") .pipe_in("to>be?") .succeeds() - .no_stderr() .stdout_only("dG8+YmU/\n"); } @@ -55,7 +54,6 @@ fn test_base64_decode() { .args(&["--base64", "-d"]) .pipe_in("dG8+YmU/") .succeeds() - .no_stderr() .stdout_only("to>be?"); } @@ -65,7 +63,6 @@ fn test_base64url() { .arg("--base64url") .pipe_in("to>be?") .succeeds() - .no_stderr() .stdout_only("dG8-YmU_\n"); } @@ -75,7 +72,6 @@ fn test_base64url_decode() { .args(&["--base64url", "-d"]) .pipe_in("dG8-YmU_") .succeeds() - .no_stderr() .stdout_only("to>be?"); } @@ -85,7 +81,6 @@ fn test_base32() { .arg("--base32") .pipe_in("nice>base?") .succeeds() - .no_stderr() .stdout_only("NZUWGZJ6MJQXGZJ7\n"); // spell-checker:disable-line } @@ -95,7 +90,6 @@ fn test_base32_decode() { .args(&["--base32", "-d"]) .pipe_in("NZUWGZJ6MJQXGZJ7") // spell-checker:disable-line .succeeds() - .no_stderr() .stdout_only("nice>base?"); } @@ -105,7 +99,6 @@ fn test_base32hex() { .arg("--base32hex") .pipe_in("nice>base?") .succeeds() - .no_stderr() .stdout_only("DPKM6P9UC9GN6P9V\n"); // spell-checker:disable-line } @@ -115,7 +108,6 @@ fn test_base32hex_decode() { .args(&["--base32hex", "-d"]) .pipe_in("DPKM6P9UC9GN6P9V") // spell-checker:disable-line .succeeds() - .no_stderr() .stdout_only("nice>base?"); } @@ -125,7 +117,6 @@ fn test_base16() { .arg("--base16") .pipe_in("Hello, World!") .succeeds() - .no_stderr() .stdout_only("48656C6C6F2C20576F726C6421\n"); } @@ -135,7 +126,6 @@ fn test_base16_decode() { .args(&["--base16", "-d"]) .pipe_in("48656C6C6F2C20576F726C6421") .succeeds() - .no_stderr() .stdout_only("Hello, World!"); } @@ -145,7 +135,6 @@ fn test_base2msbf() { .arg("--base2msbf") .pipe_in("msbf") .succeeds() - .no_stderr() .stdout_only("01101101011100110110001001100110\n"); } @@ -155,7 +144,6 @@ fn test_base2msbf_decode() { .args(&["--base2msbf", "-d"]) .pipe_in("01101101011100110110001001100110") .succeeds() - .no_stderr() .stdout_only("msbf"); } @@ -165,7 +153,6 @@ fn test_base2lsbf() { .arg("--base2lsbf") .pipe_in("lsbf") .succeeds() - .no_stderr() .stdout_only("00110110110011100100011001100110\n"); } @@ -175,7 +162,6 @@ fn test_base2lsbf_decode() { .args(&["--base2lsbf", "-d"]) .pipe_in("00110110110011100100011001100110") .succeeds() - .no_stderr() .stdout_only("lsbf"); } @@ -194,7 +180,6 @@ fn test_choose_last_encoding_z85() { ]) .pipe_in("Hello, World") .succeeds() - .no_stderr() .stdout_only("nm=QNz.92jz/PV8\n"); } @@ -213,7 +198,6 @@ fn test_choose_last_encoding_base64() { ]) .pipe_in("Hello, World!") .succeeds() - .no_stderr() .stdout_only("SGVsbG8sIFdvcmxkIQ==\n"); // spell-checker:disable-line } @@ -232,7 +216,6 @@ fn test_choose_last_encoding_base2lsbf() { ]) .pipe_in("lsbf") .succeeds() - .no_stderr() .stdout_only("00110110110011100100011001100110\n"); } @@ -253,6 +236,18 @@ fn test_base32_decode_repeated() { ]) .pipe_in("NZUWGZJ6MJQXGZJ7") // spell-checker:disable-line .succeeds() - .no_stderr() .stdout_only("nice>base?"); } + +// The restriction that input length has to be divisible by 4 only applies to data being encoded with Z85, not to the +// decoding of Z85-encoded data +#[test] +fn test_z85_length_check() { + new_ucmd!() + .args(&["--decode", "--z85"]) + // Input has length 10, not divisible by 4 + // spell-checker:disable-next-line + .pipe_in("f!$Kwh8WxM") + .succeeds() + .stdout_only("12345678"); +} From 70c3f49430c596c04e6199539b9a713cced50279 Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Tue, 24 Sep 2024 19:37:43 -0500 Subject: [PATCH 13/17] Add spelling exception --- src/uu/base32/Cargo.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/uu/base32/Cargo.toml b/src/uu/base32/Cargo.toml index 88724360f7e..abd39787359 100644 --- a/src/uu/base32/Cargo.toml +++ b/src/uu/base32/Cargo.toml @@ -1,3 +1,5 @@ +# spell-checker:ignore proptest + [package] name = "uu_base32" version = "0.0.27" From 2c25ae28389b775d27aca77913d671849526bf35 Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Tue, 24 Sep 2024 19:56:50 -0500 Subject: [PATCH 14/17] Fix spelling exception comment --- src/uu/base32/src/base_common.rs | 19 ++++++++++--------- src/uucore/src/lib/features/encoding.rs | 3 ++- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/uu/base32/src/base_common.rs b/src/uu/base32/src/base_common.rs index 9a56d131656..53f507e0d9e 100644 --- a/src/uu/base32/src/base_common.rs +++ b/src/uu/base32/src/base_common.rs @@ -740,16 +740,17 @@ fn format_read_error(kind: ErrorKind) -> String { let kind_string = kind.to_string(); // e.g. "is a directory" -> "Is a directory" - let kind_string_capitalized = kind_string - .char_indices() - .map(|(index, character)| { - if index == 0 { - character.to_uppercase().collect::() - } else { - character.to_string() + let mut kind_string_capitalized = String::with_capacity(kind_string.len()); + + for (index, ch) in kind_string.char_indices() { + if index == 0 { + for cha in ch.to_uppercase() { + kind_string_capitalized.push(cha); } - }) - .collect::(); + } else { + kind_string_capitalized.push(ch); + } + } format!("read error: {kind_string_capitalized}") } diff --git a/src/uucore/src/lib/features/encoding.rs b/src/uucore/src/lib/features/encoding.rs index b213da00761..b27d3a1f9b5 100644 --- a/src/uucore/src/lib/features/encoding.rs +++ b/src/uucore/src/lib/features/encoding.rs @@ -3,7 +3,8 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -// spell-checker:ignore (encodings) lsbf msbf unpadded +// spell-checker:ignore (encodings) lsbf msbf +// spell-checker:ignore unpadded use crate::error::{UResult, USimpleError}; use data_encoding::Encoding; From 360ed8fbd54efdb6a49ad89487280fe1de6821dd Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Thu, 26 Sep 2024 20:16:16 -0500 Subject: [PATCH 15/17] Add BENCHMARKING.md --- src/uu/basenc/BENCHMARKING.md | 185 ++++++++++++++++++++++++++++++++++ 1 file changed, 185 insertions(+) create mode 100644 src/uu/basenc/BENCHMARKING.md diff --git a/src/uu/basenc/BENCHMARKING.md b/src/uu/basenc/BENCHMARKING.md new file mode 100644 index 00000000000..7b1539de317 --- /dev/null +++ b/src/uu/basenc/BENCHMARKING.md @@ -0,0 +1,185 @@ + + +# Benchmarking base32, base64, and basenc + +Note that the functionality of the `base32` and `base64` programs is identical to that of the `basenc` program, using +the "--base32" and "--base64" options, respectively. For that reason, it is only necessary to benchmark `basenc`. + +To compare the runtime performance of the uutils implementation with the GNU Core Utilities implementation, you can +use a benchmarking tool like [hyperfine][0]. + +You can install hyperfine from source with Cargo: + +```Shell +cargo install --locked --hyperfine +``` + +See the hyperfine repository for [alternative installation methods][1]. + +hyperfine currently does not measure maximum memory usage. Memory usage can be benchmarked using [poop][2], or +[toybox][3]'s "time" subcommand (both are Linux only). + +Next, build the `basenc` binary using the release profile: + +```Shell +cargo build --package uu_basenc --profile release +``` + +## Expected performance + +uutils' `basenc` performs streaming decoding and encoding, and therefore should perform all operations with a constant +maximum memory usage, regardless of the size of the input. Release builds currently use less than 3 mebibytes of +memory, and memory usage greater than 10 mebibytes should be considered a bug. + +As of September 2024, uutils' `basenc` has runtime performance equal to or superior to GNU Core Utilities' `basenc` in +in most scenarios. uutils' `basenc` uses slightly more memory, but given how small these quantities are in absolute +terms (see above), this is highly unlikely to be practically relevant to users. + +## Benchmark results (2024-09-27) + +### Setup + +```Shell +# Use uutils' dd to create a 1 gibibyte in-memory file filled with random bytes (Linux only). +# On other platforms, you can use /tmp instead of /dev/shm, but note that /tmp is not guaranteed to be in-memory. +coreutils dd if=/dev/urandom of=/dev/shm/one-random-gibibyte bs=1024 count=1048576 + +# Encode this file for use in decoding performance testing +/usr/bin/basenc --base32hex -- /dev/shm/one-random-gibibyte 1>/dev/shm/one-random-gibibyte-base32hex-encoded +/usr/bin/basenc --z85 -- /dev/shm/one-random-gibibyte 1>/dev/shm/one-random-gibibyte-z85-encoded +``` + +### Programs being tested + +uutils' `basenc`: + +``` +❯ git rev-list HEAD | coreutils head -n 1 -- - +a0718ef0ffd50539a2e2bc0095c9fadcd70ab857 +``` + +GNU Core Utilities' `basenc`: + +``` +❯ /usr/bin/basenc --version | coreutils head -n 1 -- - +basenc (GNU coreutils) 9.4 +``` + +### Encoding performance + +#### "--base64", default line wrapping (76 characters) + +➕ Faster than GNU Core Utilities + +``` +❯ hyperfine \ + --sort \ + command \ + -- \ + '/usr/bin/basenc --base64 -- /dev/shm/one-random-gibibyte 1>/dev/null' \ + './target/release/basenc --base64 -- /dev/shm/one-random-gibibyte 1>/dev/null' + +Benchmark 1: /usr/bin/basenc --base64 -- /dev/shm/one-random-gibibyte 1>/dev/null + Time (mean ± σ): 965.1 ms ± 7.9 ms [User: 766.2 ms, System: 193.4 ms] + Range (min … max): 950.2 ms … 976.9 ms 10 runs + +Benchmark 2: ./target/release/basenc --base64 -- /dev/shm/one-random-gibibyte 1>/dev/null + Time (mean ± σ): 696.6 ms ± 9.1 ms [User: 574.9 ms, System: 117.3 ms] + Range (min … max): 683.1 ms … 713.5 ms 10 runs + +Relative speed comparison + 1.39 ± 0.02 /usr/bin/basenc --base64 -- /dev/shm/one-random-gibibyte 1>/dev/null + 1.00 ./target/release/basenc --base64 -- /dev/shm/one-random-gibibyte 1>/dev/null +``` + +#### "--base16", no line wrapping + +➖ Slower than GNU Core Utilities + +``` +❯ poop \ + '/usr/bin/basenc --base16 --wrap 0 -- /dev/shm/one-random-gibibyte' \ + './target/release/basenc --base16 --wrap 0 -- /dev/shm/one-random-gibibyte' + +Benchmark 1 (6 runs): /usr/bin/basenc --base16 --wrap 0 -- /dev/shm/one-random-gibibyte + measurement mean ± σ min … max outliers delta + wall_time 836ms ± 13.3ms 822ms … 855ms 0 ( 0%) 0% + peak_rss 2.05MB ± 73.0KB 1.94MB … 2.12MB 0 ( 0%) 0% + cpu_cycles 2.85G ± 32.8M 2.82G … 2.91G 0 ( 0%) 0% + instructions 14.0G ± 58.7 14.0G … 14.0G 0 ( 0%) 0% + cache_references 70.0M ± 6.48M 63.7M … 78.8M 0 ( 0%) 0% + cache_misses 582K ± 172K 354K … 771K 0 ( 0%) 0% + branch_misses 667K ± 4.55K 662K … 674K 0 ( 0%) 0% +Benchmark 2 (6 runs): ./target/release/basenc --base16 --wrap 0 -- /dev/shm/one-random-gibibyte + measurement mean ± σ min … max outliers delta + wall_time 884ms ± 6.38ms 878ms … 895ms 0 ( 0%) 💩+ 5.7% ± 1.6% + peak_rss 2.65MB ± 66.8KB 2.55MB … 2.74MB 0 ( 0%) 💩+ 29.3% ± 4.4% + cpu_cycles 3.15G ± 8.61M 3.14G … 3.16G 0 ( 0%) 💩+ 10.6% ± 1.1% + instructions 10.5G ± 275 10.5G … 10.5G 0 ( 0%) ⚡- 24.9% ± 0.0% + cache_references 93.5M ± 6.10M 87.2M … 104M 0 ( 0%) 💩+ 33.7% ± 11.6% + cache_misses 415K ± 52.3K 363K … 474K 0 ( 0%) - 28.8% ± 28.0% + branch_misses 1.43M ± 4.82K 1.42M … 1.43M 0 ( 0%) 💩+113.9% ± 0.9% +``` + +### Decoding performance + +#### "--base32hex" + +➕ Faster than GNU Core Utilities + +``` +❯ hyperfine \ + --sort \ + command \ + -- \ + '/usr/bin/basenc --base32hex --decode -- /dev/shm/one-random-gibibyte-base32hex-encoded 1>/dev/null' \ + './target/release/basenc --base32hex --decode -- /dev/shm/one-random-gibibyte-base32hex-encoded 1>/dev/null' + +Benchmark 1: /usr/bin/basenc --base32hex --decode -- /dev/shm/one-random-gibibyte-base32hex-encoded 1>/dev/null + Time (mean ± σ): 7.154 s ± 0.082 s [User: 6.802 s, System: 0.323 s] + Range (min … max): 7.051 s … 7.297 s 10 runs + +Benchmark 2: ./target/release/basenc --base32hex --decode -- /dev/shm/one-random-gibibyte-base32hex-encoded 1>/dev/null + Time (mean ± σ): 2.679 s ± 0.025 s [User: 2.446 s, System: 0.221 s] + Range (min … max): 2.649 s … 2.718 s 10 runs + +Relative speed comparison + 2.67 ± 0.04 /usr/bin/basenc --base32hex --decode -- /dev/shm/one-random-gibibyte-base32hex-encoded 1>/dev/null + 1.00 ./target/release/basenc --base32hex --decode -- /dev/shm/one-random-gibibyte-base32hex-encoded 1>/dev/null +``` + +#### "--z85", with "--ignore-garbage" + +➕ Faster than GNU Core Utilities + +``` +❯ poop \ + '/usr/bin/basenc --decode --ignore-garbage --z85 -- /dev/shm/one-random-gibibyte-z85-encoded' \ + './target/release/basenc --decode --ignore-garbage --z85 -- /dev/shm/one-random-gibibyte-z85-encoded' + +Benchmark 1 (3 runs): /usr/bin/basenc --decode --ignore-garbage --z85 -- /dev/shm/one-random-gibibyte-z85-encoded + measurement mean ± σ min … max outliers delta + wall_time 14.4s ± 68.4ms 14.3s … 14.4s 0 ( 0%) 0% + peak_rss 1.98MB ± 10.8KB 1.97MB … 1.99MB 0 ( 0%) 0% + cpu_cycles 58.4G ± 211M 58.3G … 58.7G 0 ( 0%) 0% + instructions 74.7G ± 64.0 74.7G … 74.7G 0 ( 0%) 0% + cache_references 41.8M ± 624K 41.2M … 42.4M 0 ( 0%) 0% + cache_misses 693K ± 118K 567K … 802K 0 ( 0%) 0% + branch_misses 1.24G ± 183K 1.24G … 1.24G 0 ( 0%) 0% +Benchmark 2 (3 runs): ./target/release/basenc --decode --ignore-garbage --z85 -- /dev/shm/one-random-gibibyte-z85-encoded + measurement mean ± σ min … max outliers delta + wall_time 2.80s ± 17.9ms 2.79s … 2.82s 0 ( 0%) ⚡- 80.5% ± 0.8% + peak_rss 2.61MB ± 67.4KB 2.57MB … 2.69MB 0 ( 0%) 💩+ 31.9% ± 5.5% + cpu_cycles 10.8G ± 27.9M 10.8G … 10.9G 0 ( 0%) ⚡- 81.5% ± 0.6% + instructions 39.0G ± 353 39.0G … 39.0G 0 ( 0%) ⚡- 47.7% ± 0.0% + cache_references 114M ± 2.43M 112M … 116M 0 ( 0%) 💩+173.3% ± 9.6% + cache_misses 1.06M ± 288K 805K … 1.37M 0 ( 0%) + 52.6% ± 72.0% + branch_misses 1.18M ± 14.7K 1.16M … 1.19M 0 ( 0%) ⚡- 99.9% ± 0.0% +``` + +[0]: https://github.com/sharkdp/hyperfine +[1]: https://github.com/sharkdp/hyperfine?tab=readme-ov-file#installation +[2]: https://github.com/andrewrk/poop +[3]: https://landley.net/toybox/ From 9fa405fad60343c796a05b8f40a5e799787a5d75 Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Fri, 27 Sep 2024 11:35:44 -0500 Subject: [PATCH 16/17] Update BENCHMARKING.md per PR comment --- src/uu/basenc/BENCHMARKING.md | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/uu/basenc/BENCHMARKING.md b/src/uu/basenc/BENCHMARKING.md index 7b1539de317..f007b82014b 100644 --- a/src/uu/basenc/BENCHMARKING.md +++ b/src/uu/basenc/BENCHMARKING.md @@ -10,14 +10,6 @@ the "--base32" and "--base64" options, respectively. For that reason, it is only To compare the runtime performance of the uutils implementation with the GNU Core Utilities implementation, you can use a benchmarking tool like [hyperfine][0]. -You can install hyperfine from source with Cargo: - -```Shell -cargo install --locked --hyperfine -``` - -See the hyperfine repository for [alternative installation methods][1]. - hyperfine currently does not measure maximum memory usage. Memory usage can be benchmarked using [poop][2], or [toybox][3]'s "time" subcommand (both are Linux only). From 32e1c54c789de5fecc3b3818eeb66475b92eb808 Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Sat, 5 Oct 2024 08:17:10 -0500 Subject: [PATCH 17/17] Fix "coreutils manpage base64" bug --- Cargo.lock | 1 + src/uu/base32/src/base32.rs | 19 ++---- src/uu/base32/src/base_common.rs | 83 +++++++++++++------------ src/uu/base64/Cargo.toml | 1 + src/uu/base64/src/base64.rs | 24 +++---- src/uu/basenc/BENCHMARKING.md | 2 +- src/uu/basenc/src/basenc.rs | 22 ++----- src/uucore/src/lib/features/encoding.rs | 4 +- tests/by-util/test_base64.rs | 35 +++++++++++ tests/by-util/test_basenc.rs | 3 +- 10 files changed, 103 insertions(+), 91 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9514fafc162..5e99683fb47 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2532,6 +2532,7 @@ dependencies = [ name = "uu_base64" version = "0.0.27" dependencies = [ + "clap", "uu_base32", "uucore", ] diff --git a/src/uu/base32/src/base32.rs b/src/uu/base32/src/base32.rs index 09250421c25..46a0361ea4a 100644 --- a/src/uu/base32/src/base32.rs +++ b/src/uu/base32/src/base32.rs @@ -3,13 +3,11 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -use std::io::{stdin, Read}; +pub mod base_common; use clap::Command; use uucore::{encoding::Format, error::UResult, help_about, help_usage}; -pub mod base_common; - const ABOUT: &str = help_about!("base32.md"); const USAGE: &str = help_usage!("base32.md"); @@ -17,20 +15,11 @@ const USAGE: &str = help_usage!("base32.md"); pub fn uumain(args: impl uucore::Args) -> UResult<()> { let format = Format::Base32; - let config: base_common::Config = base_common::parse_base_cmd_args(args, ABOUT, USAGE)?; + let config = base_common::parse_base_cmd_args(args, ABOUT, USAGE)?; - // Create a reference to stdin so we can return a locked stdin from - // parse_base_cmd_args - let stdin_raw = stdin(); - let mut input: Box = base_common::get_input(&config, &stdin_raw)?; + let mut input = base_common::get_input(&config)?; - base_common::handle_input( - &mut input, - format, - config.wrap_cols, - config.ignore_garbage, - config.decode, - ) + base_common::handle_input(&mut input, format, config) } pub fn uu_app() -> Command { diff --git a/src/uu/base32/src/base_common.rs b/src/uu/base32/src/base_common.rs index 53f507e0d9e..f6b88f55157 100644 --- a/src/uu/base32/src/base_common.rs +++ b/src/uu/base32/src/base_common.rs @@ -7,11 +7,11 @@ use clap::{crate_version, Arg, ArgAction, Command}; use std::fs::File; -use std::io::{self, ErrorKind, Read, Stdin}; -use std::path::Path; +use std::io::{self, ErrorKind, Read}; +use std::path::{Path, PathBuf}; use uucore::display::Quotable; use uucore::encoding::{ - for_fast_encode::{BASE32, BASE32HEX, BASE64, BASE64URL, HEXUPPER}, + for_base_common::{BASE32, BASE32HEX, BASE64, BASE64URL, HEXUPPER}, Format, Z85Wrapper, BASE2LSBF, BASE2MSBF, }; use uucore::encoding::{EncodingWrapper, SupportsFastDecodeAndEncode}; @@ -31,7 +31,7 @@ pub struct Config { pub decode: bool, pub ignore_garbage: bool, pub wrap_cols: Option, - pub to_read: Option, + pub to_read: Option, } pub mod options { @@ -43,9 +43,10 @@ pub mod options { impl Config { pub fn from(options: &clap::ArgMatches) -> UResult { - let file: Option = match options.get_many::(options::FILE) { + let to_read = match options.get_many::(options::FILE) { Some(mut values) => { let name = values.next().unwrap(); + if let Some(extra_op) = values.next() { return Err(UUsageError::new( BASE_CMD_PARSE_ERROR, @@ -56,19 +57,22 @@ impl Config { if name == "-" { None } else { - if !Path::exists(Path::new(name)) { + let path = Path::new(name); + + if !path.exists() { return Err(USimpleError::new( BASE_CMD_PARSE_ERROR, - format!("{}: No such file or directory", name.maybe_quote()), + format!("{}: No such file or directory", path.maybe_quote()), )); } - Some(name.clone()) + + Some(path.to_owned()) } } None => None, }; - let cols = options + let wrap_cols = options .get_one::(options::WRAP) .map(|num| { num.parse::().map_err(|_| { @@ -83,8 +87,8 @@ impl Config { Ok(Self { decode: options.get_flag(options::DECODE), ignore_garbage: options.get_flag(options::IGNORE_GARBAGE), - wrap_cols: cols, - to_read: file, + wrap_cols, + to_read, }) } } @@ -139,42 +143,43 @@ pub fn base_app(about: &'static str, usage: &str) -> Command { ) } -pub fn get_input<'a>(config: &Config, stdin_ref: &'a Stdin) -> UResult> { +pub fn get_input(config: &Config) -> UResult> { match &config.to_read { - Some(name) => { + Some(path_buf) => { // Do not buffer input, because buffering is handled by `fast_decode` and `fast_encode` - let file_buf = - File::open(Path::new(name)).map_err_context(|| name.maybe_quote().to_string())?; - Ok(Box::new(file_buf)) + let file = + File::open(path_buf).map_err_context(|| path_buf.maybe_quote().to_string())?; + + Ok(Box::new(file)) + } + None => { + let stdin_lock = io::stdin().lock(); + + Ok(Box::new(stdin_lock)) } - None => Ok(Box::new(stdin_ref.lock())), } } -pub fn handle_input( - input: &mut R, - format: Format, - wrap: Option, - ignore_garbage: bool, - decode: bool, -) -> UResult<()> { +pub fn handle_input(input: &mut R, format: Format, config: Config) -> UResult<()> { let supports_fast_decode_and_encode = get_supports_fast_decode_and_encode(format); + let supports_fast_decode_and_encode_ref = supports_fast_decode_and_encode.as_ref(); + let mut stdout_lock = io::stdout().lock(); - if decode { + if config.decode { fast_decode::fast_decode( input, &mut stdout_lock, - supports_fast_decode_and_encode.as_ref(), - ignore_garbage, + supports_fast_decode_and_encode_ref, + config.ignore_garbage, ) } else { fast_encode::fast_encode( input, &mut stdout_lock, - supports_fast_decode_and_encode.as_ref(), - wrap, + supports_fast_decode_and_encode_ref, + config.wrap_cols, ) } } @@ -423,15 +428,15 @@ pub mod fast_encode { }; // Start of buffers - // Data that was read from stdin + // Data that was read from `input` let mut input_buffer = vec![0; INPUT_BUFFER_SIZE]; assert!(!input_buffer.is_empty()); - // Data that was read from stdin but has not been encoded yet + // Data that was read from `input` but has not been encoded yet let mut leftover_buffer = VecDeque::::new(); - // Encoded data that needs to be written to output + // Encoded data that needs to be written to `output` let mut encoded_buffer = VecDeque::::new(); // End of buffers @@ -469,7 +474,7 @@ pub mod fast_encode { assert!(leftover_buffer.len() < encode_in_chunks_of_size); - // Write all data in `encoded_buffer` to output + // Write all data in `encoded_buffer` to `output` write_to_output(&mut line_wrapping, &mut encoded_buffer, &mut output, false)?; } Err(er) => { @@ -511,7 +516,7 @@ pub mod fast_decode { // Start of helper functions fn alphabet_to_table(alphabet: &[u8], ignore_garbage: bool) -> [bool; 256] { - // If "ignore_garbage" is enabled, all characters outside the alphabet are ignored + // If `ignore_garbage` is enabled, all characters outside the alphabet are ignored // If it is not enabled, only '\n' and '\r' are ignored if ignore_garbage { // Note: "false" here @@ -618,12 +623,12 @@ pub mod fast_decode { assert!(decode_in_chunks_of_size > 0); - // Note that it's not worth using "data-encoding"'s ignore functionality if "ignore_garbage" is true, because + // Note that it's not worth using "data-encoding"'s ignore functionality if `ignore_garbage` is true, because // "data-encoding"'s ignore functionality cannot discard non-ASCII bytes. The data has to be filtered before // passing it to "data-encoding", so there is no point in doing any filtering in "data-encoding". This also // allows execution to stay on the happy path in "data-encoding": // https://github.com/ia0/data-encoding/blob/4f42ad7ef242f6d243e4de90cd1b46a57690d00e/lib/src/lib.rs#L754-L756 - // Update: it is not even worth it to use "data-encoding"'s ignore functionality when "ignore_garbage" is + // It is also not worth using "data-encoding"'s ignore functionality when `ignore_garbage` is // false. // Note that the alphabet constants above already include the padding characters // TODO @@ -631,18 +636,18 @@ pub mod fast_decode { let table = alphabet_to_table(alphabet, ignore_garbage); // Start of buffers - // Data that was read from stdin + // Data that was read from `input` let mut input_buffer = vec![0; INPUT_BUFFER_SIZE]; assert!(!input_buffer.is_empty()); - // Data that was read from stdin but has not been decoded yet + // Data that was read from `input` but has not been decoded yet let mut leftover_buffer = Vec::::new(); // Decoded data that needs to be written to `output` let mut decoded_buffer = Vec::::new(); - // Buffer that will be used when "ignore_garbage" is true, and the chunk read from "input" contains garbage + // Buffer that will be used when `ignore_garbage` is true, and the chunk read from `input` contains garbage // data let mut non_garbage_buffer = Vec::::new(); // End of buffers diff --git a/src/uu/base64/Cargo.toml b/src/uu/base64/Cargo.toml index c65fe5b971e..5afc4283e6d 100644 --- a/src/uu/base64/Cargo.toml +++ b/src/uu/base64/Cargo.toml @@ -17,6 +17,7 @@ readme.workspace = true path = "src/base64.rs" [dependencies] +clap = { workspace = true } uucore = { workspace = true, features = ["encoding"] } uu_base32 = { workspace = true } diff --git a/src/uu/base64/src/base64.rs b/src/uu/base64/src/base64.rs index 6544638bdae..86eb75bf119 100644 --- a/src/uu/base64/src/base64.rs +++ b/src/uu/base64/src/base64.rs @@ -3,13 +3,10 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. +use clap::Command; use uu_base32::base_common; -pub use uu_base32::uu_app; - use uucore::{encoding::Format, error::UResult, help_about, help_usage}; -use std::io::{stdin, Read}; - const ABOUT: &str = help_about!("base64.md"); const USAGE: &str = help_usage!("base64.md"); @@ -17,18 +14,13 @@ const USAGE: &str = help_usage!("base64.md"); pub fn uumain(args: impl uucore::Args) -> UResult<()> { let format = Format::Base64; - let config: base_common::Config = base_common::parse_base_cmd_args(args, ABOUT, USAGE)?; + let config = base_common::parse_base_cmd_args(args, ABOUT, USAGE)?; - // Create a reference to stdin so we can return a locked stdin from - // parse_base_cmd_args - let stdin_raw = stdin(); - let mut input: Box = base_common::get_input(&config, &stdin_raw)?; + let mut input = base_common::get_input(&config)?; + + base_common::handle_input(&mut input, format, config) +} - base_common::handle_input( - &mut input, - format, - config.wrap_cols, - config.ignore_garbage, - config.decode, - ) +pub fn uu_app() -> Command { + base_common::base_app(ABOUT, USAGE) } diff --git a/src/uu/basenc/BENCHMARKING.md b/src/uu/basenc/BENCHMARKING.md index f007b82014b..8248cbbc53b 100644 --- a/src/uu/basenc/BENCHMARKING.md +++ b/src/uu/basenc/BENCHMARKING.md @@ -13,7 +13,7 @@ use a benchmarking tool like [hyperfine][0]. hyperfine currently does not measure maximum memory usage. Memory usage can be benchmarked using [poop][2], or [toybox][3]'s "time" subcommand (both are Linux only). -Next, build the `basenc` binary using the release profile: +Build the `basenc` binary using the release profile: ```Shell cargo build --package uu_basenc --profile release diff --git a/src/uu/basenc/src/basenc.rs b/src/uu/basenc/src/basenc.rs index ed117b22a0d..2de1223f4a1 100644 --- a/src/uu/basenc/src/basenc.rs +++ b/src/uu/basenc/src/basenc.rs @@ -3,19 +3,15 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -//spell-checker:ignore (args) lsbf msbf +// spell-checker:ignore lsbf msbf use clap::{Arg, ArgAction, Command}; use uu_base32::base_common::{self, Config, BASE_CMD_PARSE_ERROR}; - +use uucore::error::UClapError; use uucore::{ encoding::Format, error::{UResult, UUsageError}, }; - -use std::io::{stdin, Read}; -use uucore::error::UClapError; - use uucore::{help_about, help_usage}; const ABOUT: &str = help_about!("basenc.md"); @@ -81,16 +77,8 @@ fn parse_cmd_args(args: impl uucore::Args) -> UResult<(Config, Format)> { #[uucore::main] pub fn uumain(args: impl uucore::Args) -> UResult<()> { let (config, format) = parse_cmd_args(args)?; - // Create a reference to stdin so we can return a locked stdin from - // parse_base_cmd_args - let stdin_raw = stdin(); - let mut input: Box = base_common::get_input(&config, &stdin_raw)?; - base_common::handle_input( - &mut input, - format, - config.wrap_cols, - config.ignore_garbage, - config.decode, - ) + let mut input = base_common::get_input(&config)?; + + base_common::handle_input(&mut input, format, config) } diff --git a/src/uucore/src/lib/features/encoding.rs b/src/uucore/src/lib/features/encoding.rs index b27d3a1f9b5..b9150114b8a 100644 --- a/src/uucore/src/lib/features/encoding.rs +++ b/src/uucore/src/lib/features/encoding.rs @@ -11,8 +11,8 @@ use data_encoding::Encoding; use data_encoding_macro::new_encoding; use std::collections::VecDeque; -// Re-export for the faster encoding logic -pub mod for_fast_encode { +// Re-export for the faster decoding/encoding logic +pub mod for_base_common { pub use data_encoding::*; } diff --git a/tests/by-util/test_base64.rs b/tests/by-util/test_base64.rs index b0a89b4c2ae..f07da925f5b 100644 --- a/tests/by-util/test_base64.rs +++ b/tests/by-util/test_base64.rs @@ -185,3 +185,38 @@ cyBvdmVyIHRoZSBsYXp5IGRvZy4= // cSpell:enable ); } + +// Prevent regression to: +// +// ❯ coreutils manpage base64 | rg --fixed-strings -- 'base32' +// The data are encoded as described for the base32 alphabet in RFC 4648. +// to the bytes of the formal base32 alphabet. Use \-\-ignore\-garbage +// The data are encoded as described for the base32 alphabet in RFC 4648. +// to the bytes of the formal base32 alphabet. Use \-\-ignore\-garbage +#[test] +fn test_manpage() { + use std::process::{Command, Stdio}; + + let test_scenario = TestScenario::new(""); + + let child = Command::new(test_scenario.bin_path) + .arg("manpage") + .arg("base64") + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .unwrap(); + + let output = child.wait_with_output().unwrap(); + + assert_eq!(output.status.code().unwrap(), 0); + + assert!(output.stderr.is_empty()); + + let stdout_str = std::str::from_utf8(&output.stdout).unwrap(); + + assert!(stdout_str.contains("base64 alphabet")); + + assert!(!stdout_str.to_ascii_lowercase().contains("base32")); +} diff --git a/tests/by-util/test_basenc.rs b/tests/by-util/test_basenc.rs index 437413a26b7..85c05ad3ee0 100644 --- a/tests/by-util/test_basenc.rs +++ b/tests/by-util/test_basenc.rs @@ -3,7 +3,8 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -//spell-checker: ignore (encodings) lsbf msbf +// spell-checker: ignore (encodings) lsbf msbf + use crate::common::util::TestScenario; #[test]