diff --git a/.vscode/cspell.dictionaries/shell.wordlist.txt b/.vscode/cspell.dictionaries/shell.wordlist.txt index 95dea94a7cd..11ce341addf 100644 --- a/.vscode/cspell.dictionaries/shell.wordlist.txt +++ b/.vscode/cspell.dictionaries/shell.wordlist.txt @@ -25,6 +25,7 @@ sudoedit tcsh tzselect urandom +VARNAME wtmp zsh diff --git a/Cargo.lock b/Cargo.lock index fcdb2374177..457859b64a1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -547,7 +547,7 @@ dependencies = [ "lazy_static", "proc-macro2", "regex", - "syn 2.0.23", + "syn 2.0.32", "unicode-xid", ] @@ -559,7 +559,7 @@ checksum = "3e1a2532e4ed4ea13031c13bc7bc0dbca4aae32df48e9d77f0d1e743179f2ea1" dependencies = [ "lazy_static", "proc-macro2", - "syn 2.0.23", + "syn 2.0.32", ] [[package]] @@ -574,7 +574,7 @@ dependencies = [ "lazy_static", "proc-macro2", "quote", - "syn 2.0.23", + "syn 2.0.32", ] [[package]] @@ -942,7 +942,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn 2.0.23", + "syn 2.0.32", ] [[package]] @@ -1791,7 +1791,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 2.0.23", + "syn 2.0.32", "unicode-ident", ] @@ -1902,9 +1902,23 @@ checksum = "e25dfac463d778e353db5be2449d1cce89bd6fd23c9f1ea21310ce6e5a1b29c4" [[package]] name = "serde" -version = "1.0.147" +version = "1.0.193" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d193d69bae983fc11a79df82342761dfbf28a99fc8d203dca4c3c1b590948965" +checksum = "25dd9975e68d0cb5aa1120c288333fc98731bd1dd12f561e468ea4728c042b89" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.193" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43576ca501357b9b071ac53cdc7da8ef0cbd9493d8df094cd821777ea6e894d3" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.32", +] [[package]] name = "sha1" @@ -2039,9 +2053,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.23" +version = "2.0.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59fb7d6d8281a51045d62b8eb3a7d1ce347b76f312af50cd3dc0af39c87c1737" +checksum = "239814284fd6f1a4ffe4ca893952cdd93c224b6a1571c9a9eadd670295c0c9e2" dependencies = [ "proc-macro2", "quote", @@ -3296,7 +3310,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.23", + "syn 2.0.32", "wasm-bindgen-shared", ] @@ -3318,7 +3332,7 @@ checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.23", + "syn 2.0.32", "wasm-bindgen-backend", "wasm-bindgen-shared", ] diff --git a/src/uu/env/src/env.rs b/src/uu/env/src/env.rs index 1680d4f4d5b..966ce2faea6 100644 --- a/src/uu/env/src/env.rs +++ b/src/uu/env/src/env.rs @@ -5,18 +5,32 @@ // spell-checker:ignore (ToDO) chdir execvp progname subcommand subcommands unsets setenv putenv spawnp SIGSEGV SIGBUS sigaction +pub mod native_int_str; +pub mod parse_error; +pub mod split_iterator; +pub mod string_expander; +pub mod string_parser; +pub mod variable_parser; + +use clap::builder::ValueParser; use clap::{crate_name, crate_version, Arg, ArgAction, Command}; use ini::Ini; +use native_int_str::{ + from_native_int_representation_owned, Convert, NCvt, NativeIntStr, NativeIntString, NativeStr, +}; #[cfg(unix)] use nix::sys::signal::{raise, sigaction, SaFlags, SigAction, SigHandler, SigSet, Signal}; use std::borrow::Cow; use std::env; +use std::ffi::{OsStr, OsString}; use std::io::{self, Write}; +use std::ops::Deref; + #[cfg(unix)] use std::os::unix::process::ExitStatusExt; -use std::process; +use std::process::{self}; use uucore::display::Quotable; -use uucore::error::{UClapError, UResult, USimpleError, UUsageError}; +use uucore::error::{ExitCode, UError, UResult, USimpleError, UUsageError}; use uucore::line_ending::LineEnding; use uucore::{format_usage, help_about, help_section, help_usage, show_warning}; @@ -24,14 +38,16 @@ const ABOUT: &str = help_about!("env.md"); const USAGE: &str = help_usage!("env.md"); const AFTER_HELP: &str = help_section!("after help", "env.md"); +const ERROR_MSG_S_SHEBANG: &str = "use -[v]S to pass options in shebang lines"; + struct Options<'a> { ignore_env: bool, line_ending: LineEnding, - running_directory: Option<&'a str>, - files: Vec<&'a str>, - unsets: Vec<&'a str>, - sets: Vec<(&'a str, &'a str)>, - program: Vec<&'a str>, + running_directory: Option<&'a OsStr>, + files: Vec<&'a OsStr>, + unsets: Vec<&'a OsStr>, + sets: Vec<(Cow<'a, OsStr>, Cow<'a, OsStr>)>, + program: Vec<&'a OsStr>, } // print name=value env pairs on screen @@ -44,13 +60,13 @@ fn print_env(line_ending: LineEnding) { } } -fn parse_name_value_opt<'a>(opts: &mut Options<'a>, opt: &'a str) -> UResult { +fn parse_name_value_opt<'a>(opts: &mut Options<'a>, opt: &'a OsStr) -> UResult { // is it a NAME=VALUE like opt ? - if let Some(idx) = opt.find('=') { + let wrap = NativeStr::<'a>::new(opt); + let split_o = wrap.split_once(&'='); + if let Some((name, value)) = split_o { // yes, so push name, value pair - let (name, value) = opt.split_at(idx); - opts.sets.push((name, &value['='.len_utf8()..])); - + opts.sets.push((name, value)); Ok(false) } else { // no, it's a program-like opt @@ -58,7 +74,7 @@ fn parse_name_value_opt<'a>(opts: &mut Options<'a>, opt: &'a str) -> UResult(opts: &mut Options<'a>, opt: &'a str) -> UResult<()> { +fn parse_program_opt<'a>(opts: &mut Options<'a>, opt: &'a OsStr) -> UResult<()> { if opts.line_ending == LineEnding::Nul { Err(UUsageError::new( 125, @@ -96,23 +112,6 @@ fn load_config_file(opts: &mut Options) -> UResult<()> { Ok(()) } -#[cfg(not(windows))] -#[allow(clippy::ptr_arg)] -fn build_command<'a, 'b>(args: &'a Vec<&'b str>) -> (Cow<'b, str>, &'a [&'b str]) { - let progname = Cow::from(args[0]); - (progname, &args[1..]) -} - -#[cfg(windows)] -fn build_command<'a, 'b>(args: &'a mut Vec<&'b str>) -> (Cow<'b, str>, &'a [&'b str]) { - args.insert(0, "/d/c"); - let progname = env::var("ComSpec") - .map(Cow::from) - .unwrap_or_else(|_| Cow::from("cmd")); - - (progname, &args[..]) -} - pub fn uu_app() -> Command { Command::new(crate_name!()) .version(crate_version!()) @@ -134,6 +133,7 @@ pub fn uu_app() -> Command { .long("chdir") .number_of_values(1) .value_name("DIR") + .value_parser(ValueParser::os_string()) .value_hint(clap::ValueHint::DirPath) .help("change working directory to DIR"), ) @@ -153,6 +153,7 @@ pub fn uu_app() -> Command { .long("file") .value_name("PATH") .value_hint(clap::ValueHint::FilePath) + .value_parser(ValueParser::os_string()) .action(ArgAction::Append) .help( "read and set variables from a \".env\"-style configuration file \ @@ -165,193 +166,362 @@ pub fn uu_app() -> Command { .long("unset") .value_name("NAME") .action(ArgAction::Append) + .value_parser(ValueParser::os_string()) .help("remove variable from the environment"), ) - .arg(Arg::new("vars").action(ArgAction::Append)) + .arg( + Arg::new("debug") + .short('v') + .long("debug") + .action(ArgAction::SetTrue) + .help("print verbose information for each processing step"), + ) + .arg( + Arg::new("split-string") // split string handling is implemented directly, not using CLAP. But this entry here is needed for the help information output. + .short('S') + .long("split-string") + .value_name("S") + .action(ArgAction::Set) + .value_parser(ValueParser::os_string()) + .help("process and split S into separate arguments; used to pass multiple arguments on shebang lines") + ) + .arg( + Arg::new("vars") + .action(ArgAction::Append) + .value_parser(ValueParser::os_string()) + ) } -#[allow(clippy::cognitive_complexity)] -fn run_env(args: impl uucore::Args) -> UResult<()> { - let app = uu_app(); - let matches = app.try_get_matches_from(args).with_exit_code(125)?; - - let ignore_env = matches.get_flag("ignore-environment"); - let line_ending = LineEnding::from_zero_flag(matches.get_flag("null")); - let running_directory = matches.get_one::("chdir").map(|s| s.as_str()); - let files = match matches.get_many::("file") { - Some(v) => v.map(|s| s.as_str()).collect(), - None => Vec::with_capacity(0), - }; - let unsets = match matches.get_many::("unset") { - Some(v) => v.map(|s| s.as_str()).collect(), - None => Vec::with_capacity(0), - }; - - let mut opts = Options { - ignore_env, - line_ending, - running_directory, - files, - unsets, - sets: vec![], - program: vec![], - }; - - // change directory - if let Some(d) = opts.running_directory { - match env::set_current_dir(d) { - Ok(()) => d, - Err(error) => { - return Err(USimpleError::new( - 125, - format!("cannot change directory to \"{d}\": {error}"), - )); - } - }; +pub fn parse_args_from_str(text: &NativeIntStr) -> UResult> { + split_iterator::split(text).map_err(|e| match e { + parse_error::ParseError::BackslashCNotAllowedInDoubleQuotes { pos: _ } => { + USimpleError::new(125, "'\\c' must not appear in double-quoted -S string") + } + parse_error::ParseError::InvalidBackslashAtEndOfStringInMinusS { pos: _, quoting: _ } => { + USimpleError::new(125, "invalid backslash at end of string in -S") + } + parse_error::ParseError::InvalidSequenceBackslashXInMinusS { pos: _, c } => { + USimpleError::new(125, format!("invalid sequence '\\{}' in -S", c)) + } + parse_error::ParseError::MissingClosingQuote { pos: _, c: _ } => { + USimpleError::new(125, "no terminating quote in -S string") + } + parse_error::ParseError::ParsingOfVariableNameFailed { pos, msg } => { + USimpleError::new(125, format!("variable name issue (at {}): {}", pos, msg,)) + } + _ => USimpleError::new(125, format!("Error: {:?}", e)), + }) +} + +fn debug_print_args(args: &[OsString]) { + eprintln!("input args:"); + for (i, arg) in args.iter().enumerate() { + eprintln!("arg[{}]: {}", i, arg.quote()); } +} - let mut begin_prog_opts = false; - if let Some(mut iter) = matches.get_many::("vars") { - // read NAME=VALUE arguments (and up to a single program argument) - while !begin_prog_opts { - if let Some(opt) = iter.next() { - if opt == "-" { - opts.ignore_env = true; - } else { - begin_prog_opts = parse_name_value_opt(&mut opts, opt)?; - } - } else { - break; - } +fn check_and_handle_string_args( + arg: &OsString, + prefix_to_test: &str, + all_args: &mut Vec, + do_debug_print_args: Option<&Vec>, +) -> UResult { + let native_arg = NCvt::convert(arg); + if let Some(remaining_arg) = native_arg.strip_prefix(&*NCvt::convert(prefix_to_test)) { + if let Some(input_args) = do_debug_print_args { + debug_print_args(input_args); // do it here, such that its also printed when we get an error/panic during parsing } - // read any leftover program arguments - for opt in iter { - parse_program_opt(&mut opts, opt)?; + let arg_strings = parse_args_from_str(remaining_arg)?; + all_args.extend( + arg_strings + .into_iter() + .map(from_native_int_representation_owned), + ); + + Ok(true) + } else { + Ok(false) + } +} + +#[derive(Default)] +struct EnvAppData { + do_debug_printing: bool, + had_string_argument: bool, +} + +impl EnvAppData { + fn make_error_no_such_file_or_dir(&self, prog: &OsStr) -> Box { + uucore::show_error!("{}: No such file or directory", prog.quote()); + if !self.had_string_argument { + uucore::show_error!("{}", ERROR_MSG_S_SHEBANG); } + ExitCode::new(127) } - // GNU env tests this behavior - if opts.program.is_empty() && running_directory.is_some() { - return Err(UUsageError::new( - 125, - "must specify command with --chdir (-C)".to_string(), - )); + fn process_all_string_arguments( + &mut self, + original_args: &Vec, + ) -> UResult> { + let mut all_args: Vec = Vec::new(); + for arg in original_args { + match arg { + b if check_and_handle_string_args(b, "--split-string", &mut all_args, None)? => { + self.had_string_argument = true; + } + b if check_and_handle_string_args(b, "-S", &mut all_args, None)? => { + self.had_string_argument = true; + } + b if check_and_handle_string_args( + b, + "-vS", + &mut all_args, + Some(original_args), + )? => + { + self.do_debug_printing = true; + self.had_string_argument = true; + } + _ => { + all_args.push(arg.clone()); + } + } + } + + Ok(all_args) } - // NOTE: we manually set and unset the env vars below rather than using Command::env() to more - // easily handle the case where no command is given + #[allow(clippy::cognitive_complexity)] + fn run_env(&mut self, original_args: impl uucore::Args) -> UResult<()> { + let original_args: Vec = original_args.collect(); + let args = self.process_all_string_arguments(&original_args)?; + + let app = uu_app(); + let matches = app + .try_get_matches_from(args) + .map_err(|e| -> Box { + match e.kind() { + clap::error::ErrorKind::DisplayHelp + | clap::error::ErrorKind::DisplayVersion => e.into(), + _ => { + // extent any real issue with parameter parsing by the ERROR_MSG_S_SHEBANG + let s = format!("{}", e); + if !s.is_empty() { + let s = s.trim_end(); + uucore::show_error!("{}", s); + } + uucore::show_error!("{}", ERROR_MSG_S_SHEBANG); + uucore::error::ExitCode::new(125) + } + } + })?; - // remove all env vars if told to ignore presets - if opts.ignore_env { - for (ref name, _) in env::vars() { - env::remove_var(name); + let did_debug_printing_before = self.do_debug_printing; // could have been done already as part of the "-vS" string parsing + let do_debug_printing = self.do_debug_printing || matches.get_flag("debug"); + if do_debug_printing && !did_debug_printing_before { + debug_print_args(&original_args); } - } - // load .env-style config file prior to those given on the command-line - load_config_file(&mut opts)?; + let ignore_env = matches.get_flag("ignore-environment"); + let line_ending = LineEnding::from_zero_flag(matches.get_flag("null")); + let running_directory = matches.get_one::("chdir").map(|s| s.as_os_str()); + let files = match matches.get_many::("file") { + Some(v) => v.map(|s| s.as_os_str()).collect(), + None => Vec::with_capacity(0), + }; + let unsets = match matches.get_many::("unset") { + Some(v) => v.map(|s| s.as_os_str()).collect(), + None => Vec::with_capacity(0), + }; + + let mut opts = Options { + ignore_env, + line_ending, + running_directory, + files, + unsets, + sets: vec![], + program: vec![], + }; + + // change directory + if let Some(d) = opts.running_directory { + match env::set_current_dir(d) { + Ok(()) => d, + Err(error) => { + return Err(USimpleError::new( + 125, + format!("cannot change directory to {}: {error}", d.quote()), + )); + } + }; + } - // unset specified env vars - for name in &opts.unsets { - if name.is_empty() || name.contains(0 as char) || name.contains('=') { - return Err(USimpleError::new( + let mut begin_prog_opts = false; + if let Some(mut iter) = matches.get_many::("vars") { + // read NAME=VALUE arguments (and up to a single program argument) + while !begin_prog_opts { + if let Some(opt) = iter.next() { + if opt == "-" { + opts.ignore_env = true; + } else { + begin_prog_opts = parse_name_value_opt(&mut opts, opt)?; + } + } else { + break; + } + } + + // read any leftover program arguments + for opt in iter { + parse_program_opt(&mut opts, opt)?; + } + } + + // GNU env tests this behavior + if opts.program.is_empty() && running_directory.is_some() { + return Err(UUsageError::new( 125, - format!("cannot unset {}: Invalid argument", name.quote()), + "must specify command with --chdir (-C)".to_string(), )); } - env::remove_var(name); - } + // NOTE: we manually set and unset the env vars below rather than using Command::env() to more + // easily handle the case where no command is given - // set specified env vars - for &(name, val) in &opts.sets { - /* - * set_var panics if name is an empty string - * set_var internally calls setenv (on unix at least), while GNU env calls putenv instead. - * - * putenv returns successfully if provided with something like "=a" and modifies the environ - * variable to contain "=a" inside it, effectively modifying the process' current environment - * to contain a malformed string in it. Using GNU's implementation, the command `env =a` - * prints out the malformed string and even invokes the child process with that environment. - * This can be seen by using `env -i =a env` or `env -i =a cat /proc/self/environ` - * - * POSIX.1-2017 doesn't seem to mention what to do if the string is malformed (at least - * not in "Chapter 8, Environment Variables" or in the definition for environ and various - * exec*'s or in the description of env in the "Shell & Utilities" volume). - * - * It also doesn't specify any checks for putenv before modifying the environ variable, which - * is likely why glibc doesn't do so. However, the first set_var argument cannot point to - * an empty string or a string containing '='. - * - * There is no benefit in replicating GNU's env behavior, since it will only modify the - * environment in weird ways - */ - - if name.is_empty() { - show_warning!("no name specified for value {}", val.quote()); - continue; + // remove all env vars if told to ignore presets + if opts.ignore_env { + for (ref name, _) in env::vars_os() { + env::remove_var(name); + } } - env::set_var(name, val); - } - if opts.program.is_empty() { - // no program provided, so just dump all env vars to stdout - print_env(opts.line_ending); - } else { - // we need to execute a command - #[cfg(windows)] - let (prog, args) = build_command(&mut opts.program); - #[cfg(not(windows))] - let (prog, args) = build_command(&opts.program); - - /* - * On Unix-like systems Command::status either ends up calling either fork or posix_spawnp - * (which ends up calling clone). Keep using the current process would be ideal, but the - * standard library contains many checks and fail-safes to ensure the process ends up being - * created. This is much simpler than dealing with the hassles of calling execvp directly. - */ - match process::Command::new(&*prog).args(args).status() { - Ok(exit) if !exit.success() => { - #[cfg(unix)] - if let Some(exit_code) = exit.code() { - return Err(exit_code.into()); - } else { - // `exit.code()` returns `None` on Unix when the process is terminated by a signal. - // See std::os::unix::process::ExitStatusExt for more information. This prints out - // the interrupted process and the signal it received. - let signal_code = exit.signal().unwrap(); - let signal = Signal::try_from(signal_code).unwrap(); - - // We have to disable any handler that's installed by default. - // This ensures that we exit on this signal. - // For example, `SIGSEGV` and `SIGBUS` have default handlers installed in Rust. - // We ignore the errors because there is not much we can do if that fails anyway. - // SAFETY: The function is unsafe because installing functions is unsafe, but we are - // just defaulting to default behavior and not installing a function. Hence, the call - // is safe. - let _ = unsafe { - sigaction( - signal, - &SigAction::new(SigHandler::SigDfl, SaFlags::empty(), SigSet::all()), - ) - }; - - let _ = raise(signal); + // load .env-style config file prior to those given on the command-line + load_config_file(&mut opts)?; + + // unset specified env vars + for name in &opts.unsets { + let native_name = NativeStr::new(name); + if name.is_empty() + || native_name.contains(&'\0').unwrap() + || native_name.contains(&'=').unwrap() + { + return Err(USimpleError::new( + 125, + format!("cannot unset {}: Invalid argument", name.quote()), + )); + } + + env::remove_var(name); + } + + // set specified env vars + for (name, val) in &opts.sets { + /* + * set_var panics if name is an empty string + * set_var internally calls setenv (on unix at least), while GNU env calls putenv instead. + * + * putenv returns successfully if provided with something like "=a" and modifies the environ + * variable to contain "=a" inside it, effectively modifying the process' current environment + * to contain a malformed string in it. Using GNU's implementation, the command `env =a` + * prints out the malformed string and even invokes the child process with that environment. + * This can be seen by using `env -i =a env` or `env -i =a cat /proc/self/environ` + * + * POSIX.1-2017 doesn't seem to mention what to do if the string is malformed (at least + * not in "Chapter 8, Environment Variables" or in the definition for environ and various + * exec*'s or in the description of env in the "Shell & Utilities" volume). + * + * It also doesn't specify any checks for putenv before modifying the environ variable, which + * is likely why glibc doesn't do so. However, the first set_var argument cannot point to + * an empty string or a string containing '='. + * + * There is no benefit in replicating GNU's env behavior, since it will only modify the + * environment in weird ways + */ + + if name.is_empty() { + show_warning!("no name specified for value {}", val.quote()); + continue; + } + env::set_var(name, val); + } + + if opts.program.is_empty() { + // no program provided, so just dump all env vars to stdout + print_env(opts.line_ending); + } else { + // we need to execute a command + let prog = Cow::from(opts.program[0]); + let args = &opts.program[1..]; + + if do_debug_printing { + eprintln!("executable: {}", prog.quote()); + for (i, arg) in args.iter().enumerate() { + eprintln!("arg[{}]: {}", i, arg.quote()); } - #[cfg(not(unix))] - return Err(exit.code().unwrap().into()); } - Err(ref err) if err.kind() == io::ErrorKind::NotFound => return Err(127.into()), - Err(_) => return Err(126.into()), - Ok(_) => (), + + /* + * On Unix-like systems Command::status either ends up calling either fork or posix_spawnp + * (which ends up calling clone). Keep using the current process would be ideal, but the + * standard library contains many checks and fail-safes to ensure the process ends up being + * created. This is much simpler than dealing with the hassles of calling execvp directly. + */ + match process::Command::new(&*prog).args(args).status() { + Ok(exit) if !exit.success() => { + #[cfg(unix)] + if let Some(exit_code) = exit.code() { + return Err(exit_code.into()); + } else { + // `exit.code()` returns `None` on Unix when the process is terminated by a signal. + // See std::os::unix::process::ExitStatusExt for more information. This prints out + // the interrupted process and the signal it received. + let signal_code = exit.signal().unwrap(); + let signal = Signal::try_from(signal_code).unwrap(); + + // We have to disable any handler that's installed by default. + // This ensures that we exit on this signal. + // For example, `SIGSEGV` and `SIGBUS` have default handlers installed in Rust. + // We ignore the errors because there is not much we can do if that fails anyway. + // SAFETY: The function is unsafe because installing functions is unsafe, but we are + // just defaulting to default behavior and not installing a function. Hence, the call + // is safe. + let _ = unsafe { + sigaction( + signal, + &SigAction::new( + SigHandler::SigDfl, + SaFlags::empty(), + SigSet::all(), + ), + ) + }; + + let _ = raise(signal); + } + #[cfg(not(unix))] + return Err(exit.code().unwrap().into()); + } + Err(ref err) + if (err.kind() == io::ErrorKind::NotFound) + || (err.kind() == io::ErrorKind::InvalidInput) => + { + return Err(self.make_error_no_such_file_or_dir(prog.deref())); + } + Err(e) => { + uucore::show_error!("unknown error: {:?}", e); + return Err(126.into()); + } + Ok(_) => (), + } } - } - Ok(()) + Ok(()) + } } #[uucore::main] pub fn uumain(args: impl uucore::Args) -> UResult<()> { - run_env(args) + EnvAppData::default().run_env(args) } diff --git a/src/uu/env/src/native_int_str.rs b/src/uu/env/src/native_int_str.rs new file mode 100644 index 00000000000..dc1e741e1e1 --- /dev/null +++ b/src/uu/env/src/native_int_str.rs @@ -0,0 +1,325 @@ +// This file is part of the uutils coreutils package. +// +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. + +// This module contains classes and functions for dealing with the differences +// between operating systems regarding the lossless processing of OsStr/OsString. +// In contrast to existing crates with similar purpose, this module does not use any +// `unsafe` features or functions. +// Due to a suboptimal design aspect of OsStr/OsString on windows, we need to +// encode/decode to wide chars on windows operating system. +// This prevents borrowing from OsStr on windows. Anyway, if optimally used,# +// this conversion needs to be done only once in the beginning and at the end. + +use std::ffi::OsString; +#[cfg(not(target_os = "windows"))] +use std::os::unix::ffi::{OsStrExt, OsStringExt}; +#[cfg(target_os = "windows")] +use std::os::windows::prelude::*; +use std::{borrow::Cow, ffi::OsStr}; + +#[cfg(target_os = "windows")] +use u16 as NativeIntCharU; +#[cfg(not(target_os = "windows"))] +use u8 as NativeIntCharU; + +pub type NativeCharInt = NativeIntCharU; +pub type NativeIntStr = [NativeCharInt]; +pub type NativeIntString = Vec; + +pub struct NCvt; + +pub trait Convert { + fn convert(f: From) -> To; +} + +// ================ str/String ================= + +impl<'a> Convert<&'a str, Cow<'a, NativeIntStr>> for NCvt { + fn convert(f: &'a str) -> Cow<'a, NativeIntStr> { + #[cfg(target_os = "windows")] + { + Cow::Owned(f.encode_utf16().collect()) + } + + #[cfg(not(target_os = "windows"))] + { + Cow::Borrowed(f.as_bytes()) + } + } +} + +impl<'a> Convert<&'a String, Cow<'a, NativeIntStr>> for NCvt { + fn convert(f: &'a String) -> Cow<'a, NativeIntStr> { + #[cfg(target_os = "windows")] + { + Cow::Owned(f.encode_utf16().collect()) + } + + #[cfg(not(target_os = "windows"))] + { + Cow::Borrowed(f.as_bytes()) + } + } +} + +impl<'a> Convert> for NCvt { + fn convert(f: String) -> Cow<'a, NativeIntStr> { + #[cfg(target_os = "windows")] + { + Cow::Owned(f.encode_utf16().collect()) + } + + #[cfg(not(target_os = "windows"))] + { + Cow::Owned(f.into_bytes()) + } + } +} + +// ================ OsStr/OsString ================= + +impl<'a> Convert<&'a OsStr, Cow<'a, NativeIntStr>> for NCvt { + fn convert(f: &'a OsStr) -> Cow<'a, NativeIntStr> { + to_native_int_representation(f) + } +} + +impl<'a> Convert<&'a OsString, Cow<'a, NativeIntStr>> for NCvt { + fn convert(f: &'a OsString) -> Cow<'a, NativeIntStr> { + to_native_int_representation(f) + } +} + +impl<'a> Convert> for NCvt { + fn convert(f: OsString) -> Cow<'a, NativeIntStr> { + #[cfg(target_os = "windows")] + { + Cow::Owned(f.encode_wide().collect()) + } + + #[cfg(not(target_os = "windows"))] + { + Cow::Owned(f.into_vec()) + } + } +} + +// ================ Vec ================= + +impl<'a> Convert<&'a Vec<&'a str>, Vec>> for NCvt { + fn convert(f: &'a Vec<&'a str>) -> Vec> { + f.iter().map(|x| Self::convert(*x)).collect() + } +} + +impl<'a> Convert, Vec>> for NCvt { + fn convert(f: Vec<&'a str>) -> Vec> { + f.iter().map(|x| Self::convert(*x)).collect() + } +} + +impl<'a> Convert<&'a Vec, Vec>> for NCvt { + fn convert(f: &'a Vec) -> Vec> { + f.iter().map(Self::convert).collect() + } +} + +impl<'a> Convert, Vec>> for NCvt { + fn convert(f: Vec) -> Vec> { + f.into_iter().map(Self::convert).collect() + } +} + +pub fn to_native_int_representation(input: &OsStr) -> Cow<'_, NativeIntStr> { + #[cfg(target_os = "windows")] + { + Cow::Owned(input.encode_wide().collect()) + } + + #[cfg(not(target_os = "windows"))] + { + Cow::Borrowed(input.as_bytes()) + } +} + +#[allow(clippy::needless_pass_by_value)] // needed on windows +pub fn from_native_int_representation(input: Cow<'_, NativeIntStr>) -> Cow<'_, OsStr> { + #[cfg(target_os = "windows")] + { + Cow::Owned(OsString::from_wide(&input)) + } + + #[cfg(not(target_os = "windows"))] + { + match input { + Cow::Borrowed(borrow) => Cow::Borrowed(OsStr::from_bytes(borrow)), + Cow::Owned(own) => Cow::Owned(OsString::from_vec(own)), + } + } +} + +#[allow(clippy::needless_pass_by_value)] // needed on windows +pub fn from_native_int_representation_owned(input: NativeIntString) -> OsString { + #[cfg(target_os = "windows")] + { + OsString::from_wide(&input) + } + + #[cfg(not(target_os = "windows"))] + { + OsString::from_vec(input) + } +} + +pub fn get_single_native_int_value(c: &char) -> Option { + #[cfg(target_os = "windows")] + { + let mut buf = [0u16, 0]; + let s = c.encode_utf16(&mut buf); + if s.len() == 1 { + Some(buf[0]) + } else { + None + } + } + + #[cfg(not(target_os = "windows"))] + { + let mut buf = [0u8, 0, 0, 0]; + let s = c.encode_utf8(&mut buf); + if s.len() == 1 { + Some(buf[0]) + } else { + None + } + } +} + +pub fn get_char_from_native_int(ni: NativeCharInt) -> Option<(char, NativeCharInt)> { + let c_opt; + #[cfg(target_os = "windows")] + { + c_opt = char::decode_utf16([ni; 1]).next().unwrap().ok(); + }; + + #[cfg(not(target_os = "windows"))] + { + c_opt = std::str::from_utf8(&[ni; 1]) + .ok() + .map(|x| x.chars().next().unwrap()); + }; + + if let Some(c) = c_opt { + return Some((c, ni)); + } + + None +} + +pub struct NativeStr<'a> { + native: Cow<'a, NativeIntStr>, +} + +impl<'a> NativeStr<'a> { + pub fn new(str: &'a OsStr) -> Self { + Self { + native: to_native_int_representation(str), + } + } + + pub fn native(&self) -> Cow<'a, NativeIntStr> { + self.native.clone() + } + + pub fn into_native(self) -> Cow<'a, NativeIntStr> { + self.native + } + + pub fn contains(&self, x: &char) -> Option { + let n_c = get_single_native_int_value(x)?; + Some(self.native.contains(&n_c)) + } + + pub fn slice(&self, from: usize, to: usize) -> Cow<'a, OsStr> { + let result = self.match_cow(|b| Ok::<_, ()>(&b[from..to]), |o| Ok(o[from..to].to_vec())); + result.unwrap() + } + + pub fn split_once(&self, pred: &char) -> Option<(Cow<'a, OsStr>, Cow<'a, OsStr>)> { + let n_c = get_single_native_int_value(pred)?; + let p = self.native.iter().position(|&x| x == n_c)?; + let before = self.slice(0, p); + let after = self.slice(p + 1, self.native.len()); + Some((before, after)) + } + + pub fn split_at(&self, pos: usize) -> (Cow<'a, OsStr>, Cow<'a, OsStr>) { + let before = self.slice(0, pos); + let after = self.slice(pos, self.native.len()); + (before, after) + } + + pub fn strip_prefix(&self, prefix: &OsStr) -> Option> { + let n_prefix = to_native_int_representation(prefix); + let result = self.match_cow( + |b| b.strip_prefix(&*n_prefix).ok_or(()), + |o| o.strip_prefix(&*n_prefix).map(|x| x.to_vec()).ok_or(()), + ); + result.ok() + } + + pub fn strip_prefix_native(&self, prefix: &OsStr) -> Option> { + let n_prefix = to_native_int_representation(prefix); + let result = self.match_cow_native( + |b| b.strip_prefix(&*n_prefix).ok_or(()), + |o| o.strip_prefix(&*n_prefix).map(|x| x.to_vec()).ok_or(()), + ); + result.ok() + } + + fn match_cow( + &self, + f_borrow: FnBorrow, + f_owned: FnOwned, + ) -> Result, Err> + where + FnBorrow: FnOnce(&'a [NativeCharInt]) -> Result<&'a [NativeCharInt], Err>, + FnOwned: FnOnce(&Vec) -> Result, Err>, + { + match &self.native { + Cow::Borrowed(b) => { + let slice = f_borrow(b); + let os_str = slice.map(|x| from_native_int_representation(Cow::Borrowed(x))); + os_str + } + Cow::Owned(o) => { + let slice = f_owned(o); + let os_str = slice.map(from_native_int_representation_owned); + os_str.map(Cow::Owned) + } + } + } + + fn match_cow_native( + &self, + f_borrow: FnBorrow, + f_owned: FnOwned, + ) -> Result, Err> + where + FnBorrow: FnOnce(&'a [NativeCharInt]) -> Result<&'a [NativeCharInt], Err>, + FnOwned: FnOnce(&Vec) -> Result, Err>, + { + match &self.native { + Cow::Borrowed(b) => { + let slice = f_borrow(b); + slice.map(Cow::Borrowed) + } + Cow::Owned(o) => { + let slice = f_owned(o); + slice.map(Cow::Owned) + } + } + } +} diff --git a/src/uu/env/src/parse_error.rs b/src/uu/env/src/parse_error.rs new file mode 100644 index 00000000000..cbdba99ed90 --- /dev/null +++ b/src/uu/env/src/parse_error.rs @@ -0,0 +1,55 @@ +// This file is part of the uutils coreutils package. +// +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. + +use std::fmt; + +use crate::string_parser; + +/// An error returned when string arg splitting fails. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ParseError { + MissingClosingQuote { + pos: usize, + c: char, + }, + InvalidBackslashAtEndOfStringInMinusS { + pos: usize, + quoting: String, + }, + BackslashCNotAllowedInDoubleQuotes { + pos: usize, + }, + InvalidSequenceBackslashXInMinusS { + pos: usize, + c: char, + }, + ParsingOfVariableNameFailed { + pos: usize, + msg: String, + }, + InternalError { + pos: usize, + sub_err: string_parser::Error, + }, + ReachedEnd, + ContinueWithDelimiter, +} + +impl fmt::Display for ParseError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str(format!("{:?}", self).as_str()) + } +} + +impl std::error::Error for ParseError {} + +impl From for ParseError { + fn from(value: string_parser::Error) -> Self { + Self::InternalError { + pos: value.peek_position, + sub_err: value, + } + } +} diff --git a/src/uu/env/src/split_iterator.rs b/src/uu/env/src/split_iterator.rs new file mode 100644 index 00000000000..0af7a78ad8a --- /dev/null +++ b/src/uu/env/src/split_iterator.rs @@ -0,0 +1,375 @@ +// This file is part of the uutils coreutils package. +// +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. +// +// This file is based on work from Tomasz Miąsko who published it as "shell_words" crate, +// licensed under the Apache License, Version 2.0 +// or the MIT license , at your option. +// +//! Process command line according to parsing rules of original GNU env. +//! Even though it looks quite like a POSIX syntax, the original +//! "shell_words" implementation had to be adapted significantly. +//! +//! Apart from the grammar differences, there is a new feature integrated: $VARIABLE expansion. +//! +//! [GNU env] +// spell-checker:ignore (words) Tomasz Miąsko rntfv FFFD varname + +#![forbid(unsafe_code)] + +use std::borrow::Cow; + +use crate::native_int_str::from_native_int_representation; +use crate::native_int_str::NativeCharInt; +use crate::native_int_str::NativeIntStr; +use crate::native_int_str::NativeIntString; +use crate::parse_error::ParseError; +use crate::string_expander::StringExpander; +use crate::string_parser::StringParser; +use crate::variable_parser::VariableParser; + +const BACKSLASH: char = '\\'; +const DOUBLE_QUOTES: char = '\"'; +const SINGLE_QUOTES: char = '\''; +const NEW_LINE: char = '\n'; +const DOLLAR: char = '$'; + +const REPLACEMENTS: [(char, char); 9] = [ + ('r', '\r'), + ('n', '\n'), + ('t', '\t'), + ('f', '\x0C'), + ('v', '\x0B'), + ('_', ' '), + ('#', '#'), + ('$', '$'), + ('"', '"'), +]; + +const ASCII_WHITESPACE_CHARS: [char; 6] = [' ', '\t', '\r', '\n', '\x0B', '\x0C']; + +pub struct SplitIterator<'a> { + expander: StringExpander<'a>, + words: Vec>, +} + +impl<'a> SplitIterator<'a> { + pub fn new(s: &'a NativeIntStr) -> Self { + Self { + expander: StringExpander::new(s), + words: Vec::new(), + } + } + + fn skip_one(&mut self) -> Result<(), ParseError> { + self.expander + .get_parser_mut() + .consume_one_ascii_or_all_non_ascii()?; + Ok(()) + } + + fn take_one(&mut self) -> Result<(), ParseError> { + Ok(self.expander.take_one()?) + } + + fn get_current_char(&self) -> Option { + self.expander.peek().ok() + } + + fn push_char_to_word(&mut self, c: char) { + self.expander.put_one_char(c); + } + + fn push_word_to_words(&mut self) { + let word = self.expander.take_collected_output(); + self.words.push(word); + } + + fn get_parser(&self) -> &StringParser<'a> { + self.expander.get_parser() + } + + fn get_parser_mut(&mut self) -> &mut StringParser<'a> { + self.expander.get_parser_mut() + } + + fn substitute_variable<'x>(&'x mut self) -> Result<(), ParseError> { + let mut var_parse = VariableParser::<'a, '_> { + parser: self.get_parser_mut(), + }; + + let (name, default) = var_parse.parse_variable()?; + + let varname_os_str_cow = from_native_int_representation(Cow::Borrowed(name)); + let value = std::env::var_os(varname_os_str_cow); + match (&value, default) { + (None, None) => {} // do nothing, just replace it with "" + (Some(value), _) => { + self.expander.put_string(value); + } + (None, Some(default)) => { + self.expander.put_native_string(default); + } + }; + + Ok(()) + } + + fn check_and_replace_ascii_escape_code(&mut self, c: char) -> Result { + if let Some(replace) = REPLACEMENTS.iter().find(|&x| x.0 == c) { + self.skip_one()?; + self.push_char_to_word(replace.1); + return Ok(true); + } + + Ok(false) + } + + fn make_invalid_sequence_backslash_xin_minus_s(&self, c: char) -> ParseError { + ParseError::InvalidSequenceBackslashXInMinusS { + pos: self.expander.get_parser().get_peek_position(), + c, + } + } + + fn state_root(&mut self) -> Result<(), ParseError> { + loop { + match self.state_delimiter() { + Err(ParseError::ContinueWithDelimiter) => {} + Err(ParseError::ReachedEnd) => return Ok(()), + result => return result, + } + } + } + + fn state_delimiter(&mut self) -> Result<(), ParseError> { + loop { + match self.get_current_char() { + None => return Ok(()), + Some('#') => { + self.skip_one()?; + self.state_comment()?; + } + Some(BACKSLASH) => { + self.skip_one()?; + self.state_delimiter_backslash()?; + } + Some(c) if ASCII_WHITESPACE_CHARS.contains(&c) => { + self.skip_one()?; + } + Some(_) => { + // Don't consume char. Will be done in unquoted state. + self.state_unquoted()?; + } + } + } + } + + fn state_delimiter_backslash(&mut self) -> Result<(), ParseError> { + match self.get_current_char() { + None => Err(ParseError::InvalidBackslashAtEndOfStringInMinusS { + pos: self.get_parser().get_peek_position(), + quoting: "Delimiter".into(), + }), + Some('_') | Some(NEW_LINE) => { + self.skip_one()?; + Ok(()) + } + Some(DOLLAR) | Some(BACKSLASH) | Some('#') | Some(SINGLE_QUOTES) + | Some(DOUBLE_QUOTES) => { + self.take_one()?; + self.state_unquoted() + } + Some('c') => Err(ParseError::ReachedEnd), + Some(c) if self.check_and_replace_ascii_escape_code(c)? => self.state_unquoted(), + Some(c) => Err(self.make_invalid_sequence_backslash_xin_minus_s(c)), + } + } + + fn state_unquoted(&mut self) -> Result<(), ParseError> { + loop { + match self.get_current_char() { + None => { + self.push_word_to_words(); + return Err(ParseError::ReachedEnd); + } + Some(DOLLAR) => { + self.substitute_variable()?; + } + Some(SINGLE_QUOTES) => { + self.skip_one()?; + self.state_single_quoted()?; + } + Some(DOUBLE_QUOTES) => { + self.skip_one()?; + self.state_double_quoted()?; + } + Some(BACKSLASH) => { + self.skip_one()?; + self.state_unquoted_backslash()?; + } + Some(c) if ASCII_WHITESPACE_CHARS.contains(&c) => { + self.push_word_to_words(); + self.skip_one()?; + return Ok(()); + } + Some(_) => { + self.take_one()?; + } + } + } + } + + fn state_unquoted_backslash(&mut self) -> Result<(), ParseError> { + match self.get_current_char() { + None => Err(ParseError::InvalidBackslashAtEndOfStringInMinusS { + pos: self.get_parser().get_peek_position(), + quoting: "Unquoted".into(), + }), + Some(NEW_LINE) => { + self.skip_one()?; + Ok(()) + } + Some('_') => { + self.skip_one()?; + self.push_word_to_words(); + Err(ParseError::ContinueWithDelimiter) + } + Some('c') => { + self.push_word_to_words(); + Err(ParseError::ReachedEnd) + } + Some(DOLLAR) | Some(BACKSLASH) | Some(SINGLE_QUOTES) | Some(DOUBLE_QUOTES) => { + self.take_one()?; + Ok(()) + } + Some(c) if self.check_and_replace_ascii_escape_code(c)? => Ok(()), + Some(c) => Err(self.make_invalid_sequence_backslash_xin_minus_s(c)), + } + } + + fn state_single_quoted(&mut self) -> Result<(), ParseError> { + loop { + match self.get_current_char() { + None => { + return Err(ParseError::MissingClosingQuote { + pos: self.get_parser().get_peek_position(), + c: '\'', + }) + } + Some(SINGLE_QUOTES) => { + self.skip_one()?; + return Ok(()); + } + Some(BACKSLASH) => { + self.skip_one()?; + self.split_single_quoted_backslash()?; + } + Some(_) => { + self.take_one()?; + } + } + } + } + + fn split_single_quoted_backslash(&mut self) -> Result<(), ParseError> { + match self.get_current_char() { + None => Err(ParseError::MissingClosingQuote { + pos: self.get_parser().get_peek_position(), + c: '\'', + }), + Some(NEW_LINE) => { + self.skip_one()?; + Ok(()) + } + Some(SINGLE_QUOTES) | Some(BACKSLASH) => { + self.take_one()?; + Ok(()) + } + Some(c) if REPLACEMENTS.iter().any(|&x| x.0 == c) => { + // See GNU test-suite e11: In single quotes, \t remains as it is. + // Comparing with GNU behavior: \a is not accepted and issues an error. + // So apparently only known sequences are allowed, even though they are not expanded.... bug of GNU? + self.push_char_to_word(BACKSLASH); + self.take_one()?; + Ok(()) + } + Some(c) => Err(self.make_invalid_sequence_backslash_xin_minus_s(c)), + } + } + + fn state_double_quoted(&mut self) -> Result<(), ParseError> { + loop { + match self.get_current_char() { + None => { + return Err(ParseError::MissingClosingQuote { + pos: self.get_parser().get_peek_position(), + c: '"', + }) + } + Some(DOLLAR) => { + self.substitute_variable()?; + } + Some(DOUBLE_QUOTES) => { + self.skip_one()?; + return Ok(()); + } + Some(BACKSLASH) => { + self.skip_one()?; + self.state_double_quoted_backslash()?; + } + Some(_) => { + self.take_one()?; + } + } + } + } + + fn state_double_quoted_backslash(&mut self) -> Result<(), ParseError> { + match self.get_current_char() { + None => Err(ParseError::MissingClosingQuote { + pos: self.get_parser().get_peek_position(), + c: '"', + }), + Some(NEW_LINE) => { + self.skip_one()?; + Ok(()) + } + Some(DOUBLE_QUOTES) | Some(DOLLAR) | Some(BACKSLASH) => { + self.take_one()?; + Ok(()) + } + Some('c') => Err(ParseError::BackslashCNotAllowedInDoubleQuotes { + pos: self.get_parser().get_peek_position(), + }), + Some(c) if self.check_and_replace_ascii_escape_code(c)? => Ok(()), + Some(c) => Err(self.make_invalid_sequence_backslash_xin_minus_s(c)), + } + } + + fn state_comment(&mut self) -> Result<(), ParseError> { + loop { + match self.get_current_char() { + None => return Err(ParseError::ReachedEnd), + Some(NEW_LINE) => { + self.skip_one()?; + return Ok(()); + } + Some(_) => { + self.get_parser_mut().skip_until_char_or_end(NEW_LINE); + } + } + } + } + + pub fn split(mut self) -> Result, ParseError> { + self.state_root()?; + Ok(self.words) + } +} + +pub fn split(s: &NativeIntStr) -> Result, ParseError> { + let splitted_args = SplitIterator::new(s).split()?; + Ok(splitted_args) +} diff --git a/src/uu/env/src/string_expander.rs b/src/uu/env/src/string_expander.rs new file mode 100644 index 00000000000..06e4699269f --- /dev/null +++ b/src/uu/env/src/string_expander.rs @@ -0,0 +1,92 @@ +// This file is part of the uutils coreutils package. +// +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. + +use std::{ + ffi::{OsStr, OsString}, + mem, + ops::Deref, +}; + +use crate::{ + native_int_str::{to_native_int_representation, NativeCharInt, NativeIntStr}, + string_parser::{Chunk, Error, StringParser}, +}; + +/// This class makes parsing and word collection more convenient. +/// +/// It manages an "output" buffer that is automatically filled. +/// It provides "skip_one" and "take_one" that focus on +/// working with ASCII separators. Thus they will skip or take +/// all consecutive non-ascii char sequences at once. +pub struct StringExpander<'a> { + parser: StringParser<'a>, + output: Vec, +} + +impl<'a> StringExpander<'a> { + pub fn new(input: &'a NativeIntStr) -> Self { + Self { + parser: StringParser::new(input), + output: Vec::default(), + } + } + + pub fn new_at(input: &'a NativeIntStr, pos: usize) -> Self { + Self { + parser: StringParser::new_at(input, pos), + output: Vec::default(), + } + } + + pub fn get_parser(&self) -> &StringParser<'a> { + &self.parser + } + + pub fn get_parser_mut(&mut self) -> &mut StringParser<'a> { + &mut self.parser + } + + pub fn peek(&self) -> Result { + self.parser.peek() + } + + pub fn skip_one(&mut self) -> Result<(), Error> { + self.get_parser_mut().consume_one_ascii_or_all_non_ascii()?; + Ok(()) + } + + pub fn get_peek_position(&self) -> usize { + self.get_parser().get_peek_position() + } + + pub fn take_one(&mut self) -> Result<(), Error> { + let chunks = self.parser.consume_one_ascii_or_all_non_ascii()?; + for chunk in chunks { + match chunk { + Chunk::InvalidEncoding(invalid) => self.output.extend(invalid), + Chunk::ValidSingleIntChar((_c, ni)) => self.output.push(ni), + } + } + Ok(()) + } + + pub fn put_one_char(&mut self, c: char) { + let os_str = OsString::from(c.to_string()); + self.put_string(os_str); + } + + pub fn put_string>(&mut self, os_str: S) { + let native = to_native_int_representation(os_str.as_ref()); + self.output.extend(native.deref()); + } + + pub fn put_native_string(&mut self, n_str: &NativeIntStr) { + self.output.extend(n_str); + } + + pub fn take_collected_output(&mut self) -> Vec { + mem::take(&mut self.output) + } +} diff --git a/src/uu/env/src/string_parser.rs b/src/uu/env/src/string_parser.rs new file mode 100644 index 00000000000..6f8d550883b --- /dev/null +++ b/src/uu/env/src/string_parser.rs @@ -0,0 +1,182 @@ +// This file is part of the uutils coreutils package. +// +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. +// +// spell-checker:ignore (words) splitted FFFD +#![forbid(unsafe_code)] + +use std::{borrow::Cow, ffi::OsStr}; + +use crate::native_int_str::{ + from_native_int_representation, get_char_from_native_int, get_single_native_int_value, + NativeCharInt, NativeIntStr, +}; + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Error { + pub peek_position: usize, + pub err_type: ErrorType, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ErrorType { + EndOfInput, + InternalError, +} + +/// Provides a valid char or a invalid sequence of bytes. +/// +/// Invalid byte sequences can't be splitted in any meaningful way. +/// Thus, they need to be consumed as one piece. +pub enum Chunk<'a> { + InvalidEncoding(&'a NativeIntStr), + ValidSingleIntChar((char, NativeCharInt)), +} + +/// This class makes parsing a OsString char by char more convenient. +/// +/// It also allows to capturing of intermediate positions for later splitting. +pub struct StringParser<'a> { + input: &'a NativeIntStr, + pointer: usize, + remaining: &'a NativeIntStr, +} + +impl<'a> StringParser<'a> { + pub fn new(input: &'a NativeIntStr) -> Self { + let mut instance = Self { + input, + pointer: 0, + remaining: input, + }; + instance.set_pointer(0); + instance + } + + pub fn new_at(input: &'a NativeIntStr, pos: usize) -> Self { + let mut instance = Self::new(input); + instance.set_pointer(pos); + instance + } + + pub fn get_input(&self) -> &'a NativeIntStr { + self.input + } + + pub fn get_peek_position(&self) -> usize { + self.pointer + } + + pub fn peek(&self) -> Result { + self.peek_char_at_pointer(self.pointer) + } + + fn make_err(&self, err_type: ErrorType) -> Error { + Error { + peek_position: self.get_peek_position(), + err_type, + } + } + + pub fn peek_char_at_pointer(&self, at_pointer: usize) -> Result { + let split = self.input.split_at(at_pointer).1; + if split.is_empty() { + return Err(self.make_err(ErrorType::EndOfInput)); + } + if let Some((c, _ni)) = get_char_from_native_int(split[0]) { + Ok(c) + } else { + Ok('\u{FFFD}') + } + } + + fn get_chunk_with_length_at(&self, pointer: usize) -> Result<(Chunk<'a>, usize), Error> { + let (_before, after) = self.input.split_at(pointer); + if after.is_empty() { + return Err(self.make_err(ErrorType::EndOfInput)); + } + + if let Some(c_ni) = get_char_from_native_int(after[0]) { + Ok((Chunk::ValidSingleIntChar(c_ni), 1)) + } else { + let mut i = 1; + while i < after.len() { + if let Some(_c) = get_char_from_native_int(after[i]) { + break; + } + i += 1; + } + + let chunk = &after[0..i]; + Ok((Chunk::InvalidEncoding(chunk), chunk.len())) + } + } + + pub fn peek_chunk(&self) -> Option> { + return self + .get_chunk_with_length_at(self.pointer) + .ok() + .map(|(chunk, _)| chunk); + } + + pub fn consume_chunk(&mut self) -> Result, Error> { + let (chunk, len) = self.get_chunk_with_length_at(self.pointer)?; + self.set_pointer(self.pointer + len); + Ok(chunk) + } + + pub fn consume_one_ascii_or_all_non_ascii(&mut self) -> Result>, Error> { + let mut result = Vec::>::new(); + loop { + let data = self.consume_chunk()?; + let was_ascii = if let Chunk::ValidSingleIntChar((c, _ni)) = &data { + c.is_ascii() + } else { + false + }; + result.push(data); + if was_ascii { + return Ok(result); + } + + match self.peek_chunk() { + Some(Chunk::ValidSingleIntChar((c, _ni))) if c.is_ascii() => return Ok(result), + None => return Ok(result), + _ => {} + } + } + } + + pub fn skip_multiple(&mut self, skip_byte_count: usize) { + let end_ptr = self.pointer + skip_byte_count; + self.set_pointer(end_ptr); + } + + pub fn skip_until_char_or_end(&mut self, c: char) { + let native_rep = get_single_native_int_value(&c).unwrap(); + let pos = self.remaining.iter().position(|x| *x == native_rep); + + if let Some(pos) = pos { + self.set_pointer(self.pointer + pos); + } else { + self.set_pointer(self.input.len()); + } + } + + pub fn substring(&self, range: &std::ops::Range) -> &'a NativeIntStr { + let (_before1, after1) = self.input.split_at(range.start); + let (middle, _after2) = after1.split_at(range.end - range.start); + middle + } + + pub fn peek_remaining(&self) -> Cow<'a, OsStr> { + from_native_int_representation(Cow::Borrowed(self.remaining)) + } + + pub fn set_pointer(&mut self, new_pointer: usize) { + self.pointer = new_pointer; + let (_before, after) = self.input.split_at(self.pointer); + self.remaining = after; + } +} diff --git a/src/uu/env/src/variable_parser.rs b/src/uu/env/src/variable_parser.rs new file mode 100644 index 00000000000..ef80ff801a6 --- /dev/null +++ b/src/uu/env/src/variable_parser.rs @@ -0,0 +1,158 @@ +// This file is part of the uutils coreutils package. +// +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. + +use std::ops::Range; + +use crate::{native_int_str::NativeIntStr, parse_error::ParseError, string_parser::StringParser}; + +pub struct VariableParser<'a, 'b> { + pub parser: &'b mut StringParser<'a>, +} + +impl<'a, 'b> VariableParser<'a, 'b> { + fn get_current_char(&self) -> Option { + self.parser.peek().ok() + } + + fn check_variable_name_start(&self) -> Result<(), ParseError> { + if let Some(c) = self.get_current_char() { + if c.is_ascii_digit() { + return Err(ParseError::ParsingOfVariableNameFailed { + pos: self.parser.get_peek_position(), + msg: format!("Unexpected character: '{}', expected variable name must not start with 0..9", c) }); + } + } + Ok(()) + } + + fn skip_one(&mut self) -> Result<(), ParseError> { + self.parser.consume_chunk()?; + Ok(()) + } + + fn parse_braced_variable_name( + &mut self, + ) -> Result<(&'a NativeIntStr, Option<&'a NativeIntStr>), ParseError> { + let pos_start = self.parser.get_peek_position(); + + self.check_variable_name_start()?; + + let (varname_end, default_end); + loop { + match self.get_current_char() { + None => { + return Err(ParseError::ParsingOfVariableNameFailed { + pos: self.parser.get_peek_position(), msg: "Missing closing brace".into() }) + }, + Some(c) if !c.is_ascii() || c.is_ascii_alphanumeric() || c == '_' => { + self.skip_one()?; + } + Some(':') => { + varname_end = self.parser.get_peek_position(); + loop { + match self.get_current_char() { + None => { + return Err(ParseError::ParsingOfVariableNameFailed { + pos: self.parser.get_peek_position(), + msg: "Missing closing brace after default value".into() }) + }, + Some('}') => { + default_end = Some(self.parser.get_peek_position()); + self.skip_one()?; + break + }, + Some(_) => { + self.skip_one()?; + }, + } + } + break; + }, + Some('}') => { + varname_end = self.parser.get_peek_position(); + default_end = None; + self.skip_one()?; + break; + }, + Some(c) => { + return Err(ParseError::ParsingOfVariableNameFailed { + pos: self.parser.get_peek_position(), + msg: format!("Unexpected character: '{}', expected a closing brace ('}}') or colon (':')", c) + }) + }, + }; + } + + let default_opt = if let Some(default_end) = default_end { + Some(self.parser.substring(&Range { + start: varname_end + 1, + end: default_end, + })) + } else { + None + }; + + let varname = self.parser.substring(&Range { + start: pos_start, + end: varname_end, + }); + + Ok((varname, default_opt)) + } + + fn parse_unbraced_variable_name(&mut self) -> Result<&'a NativeIntStr, ParseError> { + let pos_start = self.parser.get_peek_position(); + + self.check_variable_name_start()?; + + loop { + match self.get_current_char() { + None => break, + Some(c) if c.is_ascii_alphanumeric() || c == '_' => { + self.skip_one()?; + } + Some(_) => break, + }; + } + + let pos_end = self.parser.get_peek_position(); + + if pos_end == pos_start { + return Err(ParseError::ParsingOfVariableNameFailed { + pos: pos_start, + msg: "Missing variable name".into(), + }); + } + + let varname = self.parser.substring(&Range { + start: pos_start, + end: pos_end, + }); + + Ok(varname) + } + + pub fn parse_variable( + &mut self, + ) -> Result<(&'a NativeIntStr, Option<&'a NativeIntStr>), ParseError> { + self.skip_one()?; + + let (name, default) = match self.get_current_char() { + None => { + return Err(ParseError::ParsingOfVariableNameFailed { + pos: self.parser.get_peek_position(), + msg: "missing variable name".into(), + }) + } + Some('{') => { + self.skip_one()?; + self.parse_braced_variable_name()? + } + Some(_) => (self.parse_unbraced_variable_name()?, None), + }; + + Ok((name, default)) + } +} diff --git a/tests/by-util/test_env.rs b/tests/by-util/test_env.rs index 8ce55a1d3a2..28ba97ad66a 100644 --- a/tests/by-util/test_env.rs +++ b/tests/by-util/test_env.rs @@ -2,9 +2,12 @@ // // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -// spell-checker:ignore (words) bamf chdir rlimit prlimit COMSPEC +// spell-checker:ignore (words) bamf chdir rlimit prlimit COMSPEC cout cerr FFFD +#[cfg(target_os = "linux")] +use crate::common::util::expected_result; use crate::common::util::TestScenario; +use ::env::native_int_str::{Convert, NCvt}; use std::env; use std::path::Path; use tempfile::tempdir; @@ -34,11 +37,24 @@ fn test_env_version() { #[test] fn test_echo() { - let result = new_ucmd!().arg("echo").arg("FOO-bar").succeeds(); + #[cfg(target_os = "windows")] + let args = ["cmd", "/d/c", "echo"]; + #[cfg(not(target_os = "windows"))] + let args = ["echo"]; + + let result = new_ucmd!().args(&args).arg("FOO-bar").succeeds(); assert_eq!(result.stdout_str().trim(), "FOO-bar"); } +#[cfg(target_os = "windows")] +#[test] +fn test_if_windows_batch_files_can_be_executed() { + let result = new_ucmd!().arg("./runBat.bat").succeeds(); + + assert!(result.stdout_str().contains("Hello Windows World!")); +} + #[test] fn test_file_option() { let out = new_ucmd!() @@ -245,3 +261,935 @@ fn test_fail_change_directory() { .stderr_move_str(); assert!(out.contains("env: cannot change directory to ")); } + +#[cfg(not(target_os = "windows"))] // windows has no executable "echo", its only supported as part of a batch-file +#[test] +fn test_split_string_into_args_one_argument_no_quotes() { + let scene = TestScenario::new(util_name!()); + + let out = scene + .ucmd() + .arg("-S echo hello world") + .succeeds() + .stdout_move_str(); + assert_eq!(out, "hello world\n"); +} + +#[cfg(not(target_os = "windows"))] // windows has no executable "echo", its only supported as part of a batch-file +#[test] +fn test_split_string_into_args_one_argument() { + let scene = TestScenario::new(util_name!()); + + let out = scene + .ucmd() + .arg("-S echo \"hello world\"") + .succeeds() + .stdout_move_str(); + assert_eq!(out, "hello world\n"); +} + +#[cfg(not(target_os = "windows"))] // windows has no executable "echo", its only supported as part of a batch-file +#[test] +fn test_split_string_into_args_s_escaping_challenge() { + let scene = TestScenario::new(util_name!()); + + let out = scene + .ucmd() + .args(&[r#"-S echo "hello \"great\" world""#]) + .succeeds() + .stdout_move_str(); + assert_eq!(out, "hello \"great\" world\n"); +} + +#[test] +fn test_split_string_into_args_s_escaped_c_not_allowed() { + let scene = TestScenario::new(util_name!()); + + let out = scene.ucmd().args(&[r#"-S"\c""#]).fails().stderr_move_str(); + assert_eq!( + out, + "env: '\\c' must not appear in double-quoted -S string\n" + ); +} + +#[cfg(not(target_os = "windows"))] // no printf available +#[test] +fn test_split_string_into_args_s_whitespace_handling() { + let scene = TestScenario::new(util_name!()); + + let out = scene + .ucmd() + .args(&["-Sprintf x%sx\\n A \t B \x0B\x0C\r\n"]) + .succeeds() + .stdout_move_str(); + assert_eq!(out, "xAx\nxBx\n"); +} + +#[cfg(not(target_os = "windows"))] // no printf available +#[test] +fn test_split_string_into_args_long_option_whitespace_handling() { + let scene = TestScenario::new(util_name!()); + + let out = scene + .ucmd() + .args(&["--split-string printf x%sx\\n A \t B \x0B\x0C\r\n"]) + .succeeds() + .stdout_move_str(); + assert_eq!(out, "xAx\nxBx\n"); +} + +#[cfg(not(target_os = "windows"))] // no printf available +#[test] +fn test_split_string_into_args_debug_output_whitespace_handling() { + let scene = TestScenario::new(util_name!()); + + let out = scene + .ucmd() + .args(&["-vS printf x%sx\\n A \t B \x0B\x0C\r\n"]) + .succeeds(); + assert_eq!(out.stdout_str(), "xAx\nxBx\n"); + assert_eq!(out.stderr_str(), "input args:\narg[0]: 'env'\narg[1]: $'-vS printf x%sx\\\\n A \\t B \\x0B\\x0C\\r\\n'\nexecutable: 'printf'\narg[0]: $'x%sx\\n'\narg[1]: 'A'\narg[2]: 'B'\n"); +} + +// FixMe: This test fails on MACOS: +// thread 'test_env::test_gnu_e20' panicked at 'assertion failed: `(left == right)` +// left: `"A=B C=D\n__CF_USER_TEXT_ENCODING=0x1F5:0x0:0x0\n"`, +// right: `"A=B C=D\n"`', tests/by-util/test_env.rs:369:5 +#[cfg(not(target_os = "macos"))] +#[test] +fn test_gnu_e20() { + let scene = TestScenario::new(util_name!()); + + let env_bin = String::from(crate::common::util::TESTS_BINARY) + " " + util_name!(); + + let (input, output) = ( + [ + String::from("-i"), + String::from(r#"-SA="B\_C=D" "#) + env_bin.escape_default().to_string().as_str() + "", + ], + "A=B C=D\n", + ); + + let out = scene.ucmd().args(&input).succeeds(); + assert_eq!(out.stdout_str(), output); +} + +#[test] +fn test_split_string_misc() { + use ::env::native_int_str::NCvt; + use ::env::parse_args_from_str; + + assert_eq!( + NCvt::convert(vec!["A=B", "FOO=AR", "sh", "-c", "echo $A$FOO"]), + parse_args_from_str(&NCvt::convert(r#"A=B FOO=AR sh -c "echo \$A\$FOO""#)).unwrap(), + ); + assert_eq!( + NCvt::convert(vec!["A=B", "FOO=AR", "sh", "-c", "echo $A$FOO"]), + parse_args_from_str(&NCvt::convert(r#"A=B FOO=AR sh -c 'echo $A$FOO'"#)).unwrap() + ); + assert_eq!( + NCvt::convert(vec!["A=B", "FOO=AR", "sh", "-c", "echo $A$FOO"]), + parse_args_from_str(&NCvt::convert(r#"A=B FOO=AR sh -c 'echo $A$FOO'"#)).unwrap() + ); + + assert_eq!( + NCvt::convert(vec!["-i", "A=B ' C"]), + parse_args_from_str(&NCvt::convert(r#"-i A='B \' C'"#)).unwrap() + ); +} + +#[test] +fn test_split_string_environment_vars_test() { + std::env::set_var("FOO", "BAR"); + assert_eq!( + NCvt::convert(vec!["FOO=bar", "sh", "-c", "echo xBARx =$FOO="]), + ::env::parse_args_from_str(&NCvt::convert(r#"FOO=bar sh -c "echo x${FOO}x =\$FOO=""#)) + .unwrap(), + ); +} + +#[macro_export] +macro_rules! compare_with_gnu { + ( $ts:expr, $args:expr ) => {{ + println!("=========================================================================="); + let result = $ts.ucmd().args($args).run(); + + #[cfg(target_os = "linux")] + { + let reference = expected_result(&$ts, $args); + if let Ok(reference) = reference { + let success = result.code() == reference.code() + && result.stdout_str() == reference.stdout_str() + && result.stderr_str() == reference.stderr_str(); + if !success { + println!("reference.code: {}", reference.code()); + println!(" result.code: {}", result.code()); + println!("reference.cout: {}", reference.stdout_str()); + println!(" result.cout: {}", result.stdout_str()); + println!("reference.cerr: {}", reference.stderr_str_lossy()); + println!(" result.cerr: {}", result.stderr_str_lossy()); + } + assert_eq!(result.code(), reference.code()); + assert_eq!(result.stdout_str(), reference.stdout_str()); + assert_eq!(result.stderr_str(), reference.stderr_str()); + } else { + println!( + "gnu reference test skipped. Reason: {:?}", + reference.unwrap_err() + ); + } + } + + result + }}; +} + +#[test] +fn test_env_with_gnu_reference_parsing_errors() { + let ts = TestScenario::new(util_name!()); + + compare_with_gnu!(ts, &["-S\\|echo hallo"]) // no quotes, invalid escape sequence | + .failure() + .code_is(125) + .no_stdout() + .stderr_is("env: invalid sequence '\\|' in -S\n"); + + compare_with_gnu!(ts, &["-S\\a"]) // no quotes, invalid escape sequence a + .failure() + .code_is(125) + .no_stdout() + .stderr_is("env: invalid sequence '\\a' in -S\n"); + + compare_with_gnu!(ts, &["-S\"\\a\""]) // double quotes, invalid escape sequence a + .failure() + .code_is(125) + .no_stdout() + .stderr_is("env: invalid sequence '\\a' in -S\n"); + + compare_with_gnu!(ts, &[r#"-S"\a""#]) // same as before, just using r#""# + .failure() + .code_is(125) + .no_stdout() + .stderr_is("env: invalid sequence '\\a' in -S\n"); + + compare_with_gnu!(ts, &["-S'\\a'"]) // single quotes, invalid escape sequence a + .failure() + .code_is(125) + .no_stdout() + .stderr_is("env: invalid sequence '\\a' in -S\n"); + + compare_with_gnu!(ts, &[r#"-S\|\&\;"#]) // no quotes, invalid escape sequence | + .failure() + .code_is(125) + .no_stdout() + .stderr_is("env: invalid sequence '\\|' in -S\n"); + + compare_with_gnu!(ts, &[r#"-S\<\&\;"#]) // no quotes, invalid escape sequence < + .failure() + .code_is(125) + .no_stdout() + .stderr_is("env: invalid sequence '\\<' in -S\n"); + + compare_with_gnu!(ts, &[r#"-S\>\&\;"#]) // no quotes, invalid escape sequence > + .failure() + .code_is(125) + .no_stdout() + .stderr_is("env: invalid sequence '\\>' in -S\n"); + + compare_with_gnu!(ts, &[r#"-S\`\&\;"#]) // no quotes, invalid escape sequence ` + .failure() + .code_is(125) + .no_stdout() + .stderr_is("env: invalid sequence '\\`' in -S\n"); + + compare_with_gnu!(ts, &[r#"-S"\`\&\;""#]) // double quotes, invalid escape sequence ` + .failure() + .code_is(125) + .no_stdout() + .stderr_is("env: invalid sequence '\\`' in -S\n"); + + compare_with_gnu!(ts, &[r#"-S'\`\&\;'"#]) // single quotes, invalid escape sequence ` + .failure() + .code_is(125) + .no_stdout() + .stderr_is("env: invalid sequence '\\`' in -S\n"); + + compare_with_gnu!(ts, &[r#"-S\`"#]) // ` escaped without quotes + .failure() + .code_is(125) + .no_stdout() + .stderr_is("env: invalid sequence '\\`' in -S\n"); + + compare_with_gnu!(ts, &[r#"-S"\`""#]) // ` escaped in double quotes + .failure() + .code_is(125) + .no_stdout() + .stderr_is("env: invalid sequence '\\`' in -S\n"); + + compare_with_gnu!(ts, &[r#"-S'\`'"#]) // ` escaped in single quotes + .failure() + .code_is(125) + .no_stdout() + .stderr_is("env: invalid sequence '\\`' in -S\n"); + + ts.ucmd() + .args(&[r#"-S\🦉"#]) // ` escaped in single quotes + .fails() + .code_is(125) + .no_stdout() + .stderr_is("env: invalid sequence '\\\u{FFFD}' in -S\n"); // gnu doesn't show the owl. Instead a invalid unicode ? +} + +#[test] +fn test_env_with_gnu_reference_empty_executable_single_quotes() { + let ts = TestScenario::new(util_name!()); + + ts.ucmd() + .args(&["-S''"]) // empty single quotes, considered as program name + .fails() + .code_is(127) + .no_stdout() + .stderr_is("env: '': No such file or directory\n"); // gnu version again adds escaping here +} + +#[test] +fn test_env_with_gnu_reference_empty_executable_double_quotes() { + let ts = TestScenario::new(util_name!()); + + compare_with_gnu!(ts, &["-S\"\""]) // empty double quotes, considered as program name + .failure() + .code_is(127) + .no_stdout() + .stderr_is("env: '': No such file or directory\n"); +} + +#[cfg(test)] +mod tests_split_iterator { + + enum EscapeStyle { + /// No escaping. + None, + /// Wrap in single quotes. + SingleQuoted, + /// Single quotes combined with backslash. + Mixed, + } + + /// Determines escaping style to use. + fn escape_style(s: &str) -> EscapeStyle { + if s.is_empty() { + return EscapeStyle::SingleQuoted; + } + + let mut special = false; + let mut newline = false; + let mut single_quote = false; + + for c in s.chars() { + match c { + '\n' => { + newline = true; + special = true; + } + '\'' => { + single_quote = true; + special = true; + } + '|' | '&' | ';' | '<' | '>' | '(' | ')' | '$' | '`' | '\\' | '"' | ' ' | '\t' + | '*' | '?' | '[' | '#' | '˜' | '=' | '%' => { + special = true; + } + _ => continue, + } + } + + if !special { + EscapeStyle::None + } else if newline && !single_quote { + EscapeStyle::SingleQuoted + } else { + EscapeStyle::Mixed + } + } + + /// Escapes special characters in a string, so that it will retain its literal + /// meaning when used as a part of command in Unix shell. + /// + /// It tries to avoid introducing any unnecessary quotes or escape characters, + /// but specifics regarding quoting style are left unspecified. + pub fn quote(s: &str) -> std::borrow::Cow { + // We are going somewhat out of the way to provide + // minimal amount of quoting in typical cases. + match escape_style(s) { + EscapeStyle::None => s.into(), + EscapeStyle::SingleQuoted => format!("'{}'", s).into(), + EscapeStyle::Mixed => { + let mut quoted = String::new(); + quoted.push('\''); + for c in s.chars() { + if c == '\'' { + quoted.push_str("'\\''"); + } else { + quoted.push(c); + } + } + quoted.push('\''); + quoted.into() + } + } + } + + /// Joins arguments into a single command line suitable for execution in Unix + /// shell. + /// + /// Each argument is quoted using [`quote`] to preserve its literal meaning when + /// parsed by Unix shell. + /// + /// Note: This function is essentially an inverse of [`split`]. + /// + /// # Examples + /// + /// Logging executed commands in format that can be easily copied and pasted + /// into an actual shell: + /// + /// ```rust,no_run + /// fn execute(args: &[&str]) { + /// use std::process::Command; + /// println!("Executing: {}", shell_words::join(args)); + /// Command::new(&args[0]) + /// .args(&args[1..]) + /// .spawn() + /// .expect("failed to start subprocess") + /// .wait() + /// .expect("failed to wait for subprocess"); + /// } + /// + /// execute(&["python", "-c", "print('Hello world!')"]); + /// ``` + /// + /// [`quote`]: fn.quote.html + /// [`split`]: fn.split.html + pub fn join(words: I) -> String + where + I: IntoIterator, + S: AsRef, + { + let mut line = words.into_iter().fold(String::new(), |mut line, word| { + let quoted = quote(word.as_ref()); + line.push_str(quoted.as_ref()); + line.push(' '); + line + }); + line.pop(); + line + } + + use std::ffi::OsString; + + use ::env::parse_error::ParseError; + use env::native_int_str::{from_native_int_representation_owned, Convert, NCvt}; + + fn split(input: &str) -> Result, ParseError> { + ::env::split_iterator::split(&NCvt::convert(input)).map(|vec| { + vec.into_iter() + .map(from_native_int_representation_owned) + .collect() + }) + } + + fn split_ok(cases: &[(&str, &[&str])]) { + for (i, &(input, expected)) in cases.iter().enumerate() { + match split(input) { + Err(actual) => { + panic!( + "[{i}] calling split({:?}):\nexpected: Ok({:?})\n actual: Err({:?})\n", + input, expected, actual + ); + } + Ok(actual) => { + assert!( + expected == actual.as_slice(), + "[{i}] After split({:?}).unwrap()\nexpected: {:?}\n actual: {:?}\n", + input, + expected, + actual + ); + } + } + } + } + + #[test] + fn split_empty() { + split_ok(&[("", &[])]); + } + + #[test] + fn split_initial_whitespace_is_removed() { + split_ok(&[ + (" a", &["a"]), + ("\t\t\t\tbar", &["bar"]), + ("\t \nc", &["c"]), + ]); + } + + #[test] + fn split_trailing_whitespace_is_removed() { + split_ok(&[ + ("a ", &["a"]), + ("b\t", &["b"]), + ("c\t \n \n \n", &["c"]), + ("d\n\n", &["d"]), + ]); + } + + #[test] + fn split_carriage_return() { + split_ok(&[("c\ra\r'\r'\r", &["c", "a", "\r"])]); + } + + #[test] + fn split_() { + split_ok(&[("\\'\\'", &["''"])]); + } + + #[test] + fn split_single_quotes() { + split_ok(&[ + (r#"''"#, &[r#""#]), + (r#"'a'"#, &[r#"a"#]), + (r#"'\\'"#, &[r#"\"#]), + (r#"' \\ '"#, &[r#" \ "#]), + (r#"'#'"#, &[r#"#"#]), + ]); + } + + #[test] + fn split_double_quotes() { + split_ok(&[ + (r#""""#, &[""]), + (r#""""""#, &[""]), + (r#""a b c' d""#, &["a b c' d"]), + (r#""\$""#, &["$"]), + (r#""`""#, &["`"]), + (r#""\"""#, &["\""]), + (r#""\\""#, &["\\"]), + ("\"\n\"", &["\n"]), + ("\"\\\n\"", &[""]), + ]); + } + + #[test] + fn split_unquoted() { + split_ok(&[ + (r#"\\|\\&\\;"#, &[r#"\|\&\;"#]), + (r#"\\<\\>"#, &[r#"\<\>"#]), + (r#"\\(\\)"#, &[r#"\(\)"#]), + (r#"\$"#, &[r#"$"#]), + (r#"\""#, &[r#"""#]), + (r#"\'"#, &[r#"'"#]), + ("\\\n", &[]), + (" \\\n \n", &[]), + ("a\nb\nc", &["a", "b", "c"]), + ("a\\\nb\\\nc", &["abc"]), + ("foo bar baz", &["foo", "bar", "baz"]), + ]); + } + + #[test] + fn split_trailing_backslash() { + assert_eq!( + split("\\"), + Err(ParseError::InvalidBackslashAtEndOfStringInMinusS { + pos: 1, + quoting: "Delimiter".into() + }) + ); + assert_eq!( + split(" \\"), + Err(ParseError::InvalidBackslashAtEndOfStringInMinusS { + pos: 2, + quoting: "Delimiter".into() + }) + ); + assert_eq!( + split("a\\"), + Err(ParseError::InvalidBackslashAtEndOfStringInMinusS { + pos: 2, + quoting: "Unquoted".into() + }) + ); + } + + #[test] + fn split_errors() { + assert_eq!( + split("'abc"), + Err(ParseError::MissingClosingQuote { pos: 4, c: '\'' }) + ); + assert_eq!( + split("\""), + Err(ParseError::MissingClosingQuote { pos: 1, c: '"' }) + ); + assert_eq!( + split("'\\"), + Err(ParseError::MissingClosingQuote { pos: 2, c: '\'' }) + ); + assert_eq!( + split("'\\"), + Err(ParseError::MissingClosingQuote { pos: 2, c: '\'' }) + ); + assert_eq!( + split(r#""$""#), + Err(ParseError::ParsingOfVariableNameFailed { + pos: 2, + msg: "Missing variable name".into() + }), + ); + } + + #[test] + fn split_error_fail_with_unknown_escape_sequences() { + assert_eq!( + split("\\a"), + Err(ParseError::InvalidSequenceBackslashXInMinusS { pos: 1, c: 'a' }) + ); + assert_eq!( + split("\"\\a\""), + Err(ParseError::InvalidSequenceBackslashXInMinusS { pos: 2, c: 'a' }) + ); + assert_eq!( + split("'\\a'"), + Err(ParseError::InvalidSequenceBackslashXInMinusS { pos: 2, c: 'a' }) + ); + assert_eq!( + split(r#""\a""#), + Err(ParseError::InvalidSequenceBackslashXInMinusS { pos: 2, c: 'a' }) + ); + assert_eq!( + split(r#"\🦉"#), + Err(ParseError::InvalidSequenceBackslashXInMinusS { + pos: 1, + c: '\u{FFFD}' + }) + ); + } + + #[test] + fn split_comments() { + split_ok(&[ + (r#" x # comment "#, &["x"]), + (r#" w1#w2 "#, &["w1#w2"]), + (r#"'not really a # comment'"#, &["not really a # comment"]), + (" a # very long comment \n b # another comment", &["a", "b"]), + ]); + } + + #[test] + fn test_quote() { + assert_eq!(quote(""), "''"); + assert_eq!(quote("'"), "''\\'''"); + assert_eq!(quote("abc"), "abc"); + assert_eq!(quote("a \n b"), "'a \n b'"); + assert_eq!(quote("X'\nY"), "'X'\\''\nY'"); + } + + #[test] + fn test_join() { + assert_eq!(join(["a", "b", "c"]), "a b c"); + assert_eq!(join([" ", "$", "\n"]), "' ' '$' '\n'"); + } + + #[test] + fn join_followed_by_split_is_identity() { + let cases: Vec<&[&str]> = vec![ + &["a"], + &["python", "-c", "print('Hello world!')"], + &["echo", " arg with spaces ", "arg \' with \" quotes"], + &["even newlines are quoted correctly\n", "\n", "\n\n\t "], + &["$", "`test`"], + &["cat", "~user/log*"], + &["test", "'a \"b", "\"X'"], + &["empty", "", "", ""], + ]; + for argv in cases { + let args = join(argv); + assert_eq!(split(&args).unwrap(), argv); + } + } +} + +mod test_raw_string_parser { + use std::{ + borrow::Cow, + ffi::{OsStr, OsString}, + }; + + use env::{ + native_int_str::{ + from_native_int_representation, from_native_int_representation_owned, + to_native_int_representation, NativeStr, + }, + string_expander::StringExpander, + string_parser, + }; + + const LEN_OWL: usize = if cfg!(target_os = "windows") { 2 } else { 4 }; + + #[test] + fn test_ascii_only_take_one_look_at_correct_data_and_end_behavior() { + let input = "hello"; + let cow = to_native_int_representation(OsStr::new(input)); + let mut uut = StringExpander::new(&cow); + for c in input.chars() { + assert_eq!(c, uut.get_parser().peek().unwrap()); + uut.take_one().unwrap(); + } + assert_eq!( + uut.get_parser().peek(), + Err(string_parser::Error { + peek_position: 5, + err_type: string_parser::ErrorType::EndOfInput + }) + ); + uut.take_one().unwrap_err(); + uut.take_one().unwrap_err(); + assert_eq!( + from_native_int_representation_owned(uut.take_collected_output()), + input + ); + uut.take_one().unwrap_err(); + assert_eq!( + from_native_int_representation_owned(uut.take_collected_output()), + "" + ); + } + + #[test] + fn test_multi_byte_codes_take_one_look_at_correct_data_and_end_behavior() { + let input = OsString::from("🦉🦉🦉x🦉🦉x🦉x🦉🦉🦉🦉"); + let cow = to_native_int_representation(input.as_os_str()); + let mut uut = StringExpander::new(&cow); + for _i in 0..3 { + assert_eq!(uut.get_parser().peek().unwrap(), '\u{FFFD}'); + uut.take_one().unwrap(); + assert_eq!(uut.get_parser().peek().unwrap(), 'x'); + uut.take_one().unwrap(); + } + assert_eq!(uut.get_parser().peek().unwrap(), '\u{FFFD}'); + uut.take_one().unwrap(); + assert_eq!( + uut.get_parser().peek(), + Err(string_parser::Error { + peek_position: 10 * LEN_OWL + 3, + err_type: string_parser::ErrorType::EndOfInput + }) + ); + uut.take_one().unwrap_err(); + uut.take_one().unwrap_err(); + assert_eq!( + from_native_int_representation_owned(uut.take_collected_output()), + input + ); + uut.take_one().unwrap_err(); + assert_eq!( + from_native_int_representation_owned(uut.take_collected_output()), + "" + ); + } + + #[test] + fn test_multi_byte_codes_put_one_ascii_start_middle_end_try_invalid_ascii() { + let input = OsString::from("🦉🦉🦉x🦉🦉x🦉x🦉🦉🦉🦉"); + let cow = to_native_int_representation(input.as_os_str()); + let owl: char = '🦉'; + let mut uut = StringExpander::new(&cow); + uut.put_one_char('a'); + for _i in 0..3 { + assert_eq!(uut.get_parser().peek().unwrap(), '\u{FFFD}'); + uut.take_one().unwrap(); + uut.put_one_char('a'); + assert_eq!(uut.get_parser().peek().unwrap(), 'x'); + uut.take_one().unwrap(); + uut.put_one_char('a'); + } + assert_eq!(uut.get_parser().peek().unwrap(), '\u{FFFD}'); + uut.take_one().unwrap(); + uut.put_one_char(owl); + uut.put_one_char('a'); + assert_eq!( + uut.get_parser().peek(), + Err(string_parser::Error { + peek_position: LEN_OWL * 10 + 3, + err_type: string_parser::ErrorType::EndOfInput + }) + ); + uut.take_one().unwrap_err(); + uut.put_one_char('a'); + uut.take_one().unwrap_err(); + assert_eq!( + from_native_int_representation_owned(uut.take_collected_output()), + "a🦉🦉🦉axa🦉🦉axa🦉axa🦉🦉🦉🦉🦉aa" + ); + uut.take_one().unwrap_err(); + assert_eq!( + from_native_int_representation_owned(uut.take_collected_output()), + "" + ); + } + + #[test] + fn test_multi_byte_codes_skip_one_take_one_skip_until_ascii_char_or_end() { + let input = OsString::from("🦉🦉🦉x🦉🦉x🦉x🦉🦉🦉🦉"); + let cow = to_native_int_representation(input.as_os_str()); + let mut uut = StringExpander::new(&cow); + + uut.skip_one().unwrap(); // skip 🦉🦉🦉 + let p = LEN_OWL * 3; + assert_eq!(uut.get_peek_position(), p); + + uut.skip_one().unwrap(); // skip x + assert_eq!(uut.get_peek_position(), p + 1); + uut.take_one().unwrap(); // take 🦉🦉 + let p = p + 1 + LEN_OWL * 2; + assert_eq!(uut.get_peek_position(), p); + + uut.skip_one().unwrap(); // skip x + assert_eq!(uut.get_peek_position(), p + 1); + uut.get_parser_mut().skip_until_char_or_end('x'); // skip 🦉 + let p = p + 1 + LEN_OWL; + assert_eq!(uut.get_peek_position(), p); + uut.take_one().unwrap(); // take x + uut.get_parser_mut().skip_until_char_or_end('x'); // skip 🦉🦉🦉🦉 till end + let p = p + 1 + LEN_OWL * 4; + assert_eq!(uut.get_peek_position(), p); + + uut.take_one().unwrap_err(); + assert_eq!( + from_native_int_representation_owned(uut.take_collected_output()), + "🦉🦉x" + ); + } + + #[test] + fn test_multi_byte_codes_skip_multiple_ascii_bounded_good_and_bad() { + let input = OsString::from("🦉🦉🦉x🦉🦉x🦉x🦉🦉🦉🦉"); + let cow = to_native_int_representation(input.as_os_str()); + let mut uut = StringExpander::new(&cow); + + uut.get_parser_mut().skip_multiple(0); + assert_eq!(uut.get_peek_position(), 0); + let p = LEN_OWL * 3; + uut.get_parser_mut().skip_multiple(p); // skips 🦉🦉🦉 + assert_eq!(uut.get_peek_position(), p); + + uut.take_one().unwrap(); // take x + assert_eq!(uut.get_peek_position(), p + 1); + let step = LEN_OWL * 3 + 1; + uut.get_parser_mut().skip_multiple(step); // skips 🦉🦉x🦉 + let p = p + 1 + step; + assert_eq!(uut.get_peek_position(), p); + uut.take_one().unwrap(); // take x + + assert_eq!(uut.get_peek_position(), p + 1); + let step = 4 * LEN_OWL; + uut.get_parser_mut().skip_multiple(step); // skips 🦉🦉🦉🦉 + let p = p + 1 + step; + assert_eq!(uut.get_peek_position(), p); + + uut.take_one().unwrap_err(); + assert_eq!( + from_native_int_representation_owned(uut.take_collected_output()), + "xx" + ); + } + + #[test] + fn test_multi_byte_codes_put_string_utf8_start_middle_end() { + let input = OsString::from("🦉🦉🦉x🦉🦉x🦉x🦉🦉🦉🦉"); + let cow = to_native_int_representation(input.as_os_str()); + let mut uut = StringExpander::new(&cow); + + uut.put_string("🦔oo"); + uut.take_one().unwrap(); // takes 🦉🦉🦉 + uut.put_string("oo🦔"); + uut.take_one().unwrap(); // take x + uut.get_parser_mut().skip_until_char_or_end('\n'); // skips till end + uut.put_string("o🦔o"); + + uut.take_one().unwrap_err(); + assert_eq!( + from_native_int_representation_owned(uut.take_collected_output()), + "🦔oo🦉🦉🦉oo🦔xo🦔o" + ); + } + + #[test] + fn test_multi_byte_codes_look_at_remaining_start_middle_end() { + let input = "🦉🦉🦉x🦉🦉x🦉x🦉🦉🦉🦉"; + let cow = to_native_int_representation(OsStr::new(input)); + let mut uut = StringExpander::new(&cow); + + assert_eq!(uut.get_parser().peek_remaining(), OsStr::new(input)); + uut.take_one().unwrap(); // takes 🦉🦉🦉 + assert_eq!(uut.get_parser().peek_remaining(), OsStr::new(&input[12..])); + uut.get_parser_mut().skip_until_char_or_end('\n'); // skips till end + assert_eq!(uut.get_parser().peek_remaining(), OsStr::new("")); + + uut.take_one().unwrap_err(); + assert_eq!( + from_native_int_representation_owned(uut.take_collected_output()), + "🦉🦉🦉" + ); + } + + #[test] + fn test_deal_with_invalid_encoding() { + let owl_invalid_part; + let (brace_1, brace_2); + #[cfg(target_os = "windows")] + { + let mut buffer = [0u16; 2]; + let owl = '🦉'.encode_utf16(&mut buffer); + owl_invalid_part = owl[0]; + brace_1 = '<'.encode_utf16(&mut buffer).to_vec(); + brace_2 = '>'.encode_utf16(&mut buffer).to_vec(); + } + #[cfg(not(target_os = "windows"))] + { + let mut buffer = [0u8; 4]; + let owl = '🦉'.encode_utf8(&mut buffer); + owl_invalid_part = owl.bytes().next().unwrap(); + brace_1 = [b'<'].to_vec(); + brace_2 = [b'>'].to_vec(); + } + let mut input_ux = brace_1; + input_ux.push(owl_invalid_part); + input_ux.extend(brace_2); + let input_str = from_native_int_representation(Cow::Borrowed(&input_ux)); + let mut uut = StringExpander::new(&input_ux); + + assert_eq!(uut.get_parser().peek_remaining(), input_str); + assert_eq!(uut.get_parser().peek().unwrap(), '<'); + uut.take_one().unwrap(); // takes "<" + assert_eq!( + uut.get_parser().peek_remaining(), + NativeStr::new(&input_str).split_at(1).1 + ); + assert_eq!(uut.get_parser().peek().unwrap(), '\u{FFFD}'); + uut.take_one().unwrap(); // takes owl_b + assert_eq!( + uut.get_parser().peek_remaining(), + NativeStr::new(&input_str).split_at(2).1 + ); + assert_eq!(uut.get_parser().peek().unwrap(), '>'); + uut.get_parser_mut().skip_until_char_or_end('\n'); + assert_eq!(uut.get_parser().peek_remaining(), OsStr::new("")); + + uut.take_one().unwrap_err(); + assert_eq!( + from_native_int_representation_owned(uut.take_collected_output()), + NativeStr::new(&input_str).split_at(2).0 + ); + } +} diff --git a/tests/common/util.rs b/tests/common/util.rs index 44f364b916a..1d9786bbef8 100644 --- a/tests/common/util.rs +++ b/tests/common/util.rs @@ -15,7 +15,6 @@ use pretty_assertions::assert_eq; use rlimit::setrlimit; #[cfg(feature = "sleep")] use rstest::rstest; -#[cfg(unix)] use std::borrow::Cow; use std::collections::VecDeque; #[cfg(not(windows))] @@ -352,6 +351,11 @@ impl CmdResult { std::str::from_utf8(&self.stderr).unwrap() } + /// Returns the program's standard error as a string slice, automatically handling invalid utf8 + pub fn stderr_str_lossy(&self) -> Cow<'_, str> { + String::from_utf8_lossy(&self.stderr) + } + /// Returns the program's standard error as a string /// consumes self pub fn stderr_move_str(self) -> String { @@ -372,6 +376,14 @@ impl CmdResult { #[track_caller] pub fn code_is(&self, expected_code: i32) -> &Self { + let fails = self.code() != expected_code; + if fails { + eprintln!( + "stdout:\n{}\nstderr:\n{}", + self.stdout_str(), + self.stderr_str() + ); + } assert_eq!(self.code(), expected_code); self } @@ -395,10 +407,8 @@ impl CmdResult { pub fn success(&self) -> &Self { assert!( self.succeeded(), - "Command was expected to succeed. Exit code: {}.\nstdout = {}\n stderr = {}", - self.exit_status() - .code() - .map_or("n/a".to_string(), |code| code.to_string()), + "Command was expected to succeed. code: {}\nstdout = {}\n stderr = {}", + self.code(), self.stdout_str(), self.stderr_str() ); @@ -2674,7 +2684,7 @@ pub fn expected_result(ts: &TestScenario, args: &[&str]) -> std::result::Result< let (stdout, stderr): (String, String) = if cfg!(target_os = "linux") { ( result.stdout_str().to_string(), - result.stderr_str().to_string(), + result.stderr_str_lossy().to_string(), ) } else { // `host_name_for` added prefix, strip 'g' prefix from results: @@ -2682,7 +2692,7 @@ pub fn expected_result(ts: &TestScenario, args: &[&str]) -> std::result::Result< let to = &from[1..]; ( result.stdout_str().replace(&from, to), - result.stderr_str().replace(&from, to), + result.stderr_str_lossy().replace(&from, to), ) }; diff --git a/tests/fixtures/env/runBat.bat b/tests/fixtures/env/runBat.bat new file mode 100644 index 00000000000..63ab744d3ab --- /dev/null +++ b/tests/fixtures/env/runBat.bat @@ -0,0 +1 @@ +echo Hello Windows World! diff --git a/util/build-gnu.sh b/util/build-gnu.sh index 1faf1e66be5..876e645faed 100755 --- a/util/build-gnu.sh +++ b/util/build-gnu.sh @@ -221,6 +221,8 @@ grep -rlE '/usr/local/bin/\s?/usr/local/bin' init.cfg tests/* | xargs -r sed -Ei # we should not regress our project just to match what GNU is going. # So, do some changes on the fly +patch -N -r - -d "$path_GNU" -p 1 -i "`realpath \"$path_UUTILS/util/gnu-patches/tests_env_env-S.pl.patch\"`" || true + sed -i -e "s|rm: cannot remove 'e/slink'|rm: cannot remove 'e'|g" tests/rm/fail-eacces.sh sed -i -e "s|rm: cannot remove 'a/b'|rm: cannot remove 'a'|g" tests/rm/fail-2eperm.sh diff --git a/util/gnu-patches/tests_env_env-S.pl.patch b/util/gnu-patches/tests_env_env-S.pl.patch new file mode 100644 index 00000000000..404a00ca60e --- /dev/null +++ b/util/gnu-patches/tests_env_env-S.pl.patch @@ -0,0 +1,47 @@ +diff --git a/tests/env/env-S.pl b/tests/env/env-S.pl +index 710ca82cf..af7cf6efa 100755 +--- a/tests/env/env-S.pl ++++ b/tests/env/env-S.pl +@@ -209,27 +209,28 @@ my @Tests = + {ERR=>"$prog: no terminating quote in -S string\n"}], + ['err5', q[-S'A=B\\q'], {EXIT=>125}, + {ERR=>"$prog: invalid sequence '\\q' in -S\n"}], +- ['err6', q[-S'A=$B'], {EXIT=>125}, +- {ERR=>"$prog: only \${VARNAME} expansion is supported, error at: \$B\n"}], ++ ['err6', q[-S'A=$B echo hello'], {EXIT=>0}, ++ {OUT=>"hello"}], + ['err7', q[-S'A=${B'], {EXIT=>125}, +- {ERR=>"$prog: only \${VARNAME} expansion is supported, " . +- "error at: \${B\n"}], ++ {ERR=>"$prog" . qq[: variable name issue (at 5): Missing closing brace\n]}], + ['err8', q[-S'A=${B%B}'], {EXIT=>125}, +- {ERR=>"$prog: only \${VARNAME} expansion is supported, " . +- "error at: \${B%B}\n"}], ++ {ERR=>"$prog" . qq[: variable name issue (at 5): Unexpected character: '%', expected a closing brace ('}') or colon (':')\n]}], + ['err9', q[-S'A=${9B}'], {EXIT=>125}, +- {ERR=>"$prog: only \${VARNAME} expansion is supported, " . +- "error at: \${9B}\n"}], ++ {ERR=>"$prog" . qq[: variable name issue (at 4): Unexpected character: '9', expected variable name must not start with 0..9\n]}], + + # Test incorrect shebang usage (extraneous whitespace). + ['err_sp2', q['-v -S cat -n'], {EXIT=>125}, +- {ERR=>"env: invalid option -- ' '\n" . +- "env: use -[v]S to pass options in shebang lines\n" . +- "Try 'env --help' for more information.\n"}], ++ {ERR=>"$prog: error: unexpected argument '- ' found\n\n" . ++ " tip: to pass '- ' as a value, use '-- - '\n\n" . ++ "Usage: $prog [OPTION]... [-] [NAME=VALUE]... [COMMAND [ARG]...]\n\n" . ++ "For more information, try '--help'.\n" . ++ "$prog: use -[v]S to pass options in shebang lines\n"}], + ['err_sp3', q['-v -S cat -n'], {EXIT=>125}, # embedded tab after -v +- {ERR=>"env: invalid option -- '\t'\n" . +- "env: use -[v]S to pass options in shebang lines\n" . +- "Try 'env --help' for more information.\n"}], ++ {ERR=>"$prog: error: unexpected argument '-\t' found\n\n" . ++ " tip: to pass '-\t' as a value, use '-- -\t'\n\n" . ++ "Usage: $prog [OPTION]... [-] [NAME=VALUE]... [COMMAND [ARG]...]\n\n" . ++ "For more information, try '--help'.\n" . ++ "$prog: use -[v]S to pass options in shebang lines\n"}], + + # Also diagnose incorrect shebang usage when failing to exec. + # This typically happens with: