diff --git a/Cargo.lock b/Cargo.lock index 202a6b25d..ff730c01f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2207,6 +2207,7 @@ dependencies = [ "libc", "libc-stdhandle", "metrics", + "percent-encoding", "pin-project", "proptest", "rand", @@ -2219,7 +2220,6 @@ dependencies = [ "tokio", "tracing", "tracing-subscriber", - "urlencoding", "xmltree", ] diff --git a/s3-client/Cargo.toml b/s3-client/Cargo.toml index 24c4e2b8d..ea67e29b7 100644 --- a/s3-client/Cargo.toml +++ b/s3-client/Cargo.toml @@ -16,13 +16,13 @@ lazy_static = "1.4.0" libc = "0.2.126" libc-stdhandle = "0.1.0" metrics = "0.20.1" +percent-encoding = "2.2.0" pin-project = "1.0.12" regex = "1.7.1" static_assertions = "1.1.0" thiserror = "1.0.34" time = "0.3.14" tracing = { version = "0.1.35", default-features = false, features = ["std", "log"] } -urlencoding = "2.1.2" xmltree = "0.10.3" [dev-dependencies] diff --git a/s3-client/src/s3_crt_client.rs b/s3-client/src/s3_crt_client.rs index d7f92fe1a..633824fed 100644 --- a/s3-client/src/s3_crt_client.rs +++ b/s3-client/src/s3_crt_client.rs @@ -1,6 +1,7 @@ use std::ffi::{OsStr, OsString}; use std::future::Future; use std::ops::Range; +use std::os::unix::prelude::OsStrExt; use std::pin::Pin; use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; @@ -22,6 +23,7 @@ use aws_crt_s3::s3::client::{ use async_trait::async_trait; use futures::channel::oneshot; +use percent_encoding::{percent_encode, AsciiSet, NON_ALPHANUMERIC}; use pin_project::pin_project; use thiserror::Error; use tracing::{debug, error, trace, warn, Span}; @@ -358,14 +360,61 @@ impl<'a> S3Message<'a> { self.inner.add_header(header) } - /// Set the request path for this message. - fn set_request_path(&mut self, path: impl AsRef) -> Result<(), aws_crt_s3::common::error::Error> { - let mut full_path = OsString::with_capacity(self.path_prefix.len() + path.as_ref().len()); - full_path.push(&self.path_prefix); - full_path.push(path); + /// Set the request path and query for this message. The components should not be URL-encoded; + /// this method will handle that. + fn set_request_path_and_query>( + &mut self, + path: impl AsRef, + query: impl AsRef<[(P, P)]>, + ) -> Result<(), aws_crt_s3::common::error::Error> { + // This is RFC 3986 but with '/' also considered a safe character for path fragments. + const URLENCODE_QUERY_FRAGMENT: &AsciiSet = + &NON_ALPHANUMERIC.remove(b'-').remove(b'.').remove(b'_').remove(b'~'); + const URLENCODE_PATH_FRAGMENT: &AsciiSet = &URLENCODE_QUERY_FRAGMENT.remove(b'/'); + + fn write_encoded_fragment(s: &mut OsString, piece: impl AsRef, encoding: &'static AsciiSet) { + let iter = percent_encode(piece.as_ref().as_bytes(), encoding); + s.extend(iter.map(|s| OsStr::from_bytes(s.as_bytes()))); + } + + // This estimate is exact if no characters need encoding, otherwise we'll end up + // reallocating a couple of times. The '?' for the query is counted in the first key-value + // pair. + let space_needed = self.path_prefix.len() + + path.as_ref().len() + + query + .as_ref() + .iter() + .map(|(key, value)| key.as_ref().len() + value.as_ref().len() + 2) // +2 for & and = + .sum::(); + + let mut full_path = OsString::with_capacity(space_needed); + + write_encoded_fragment(&mut full_path, &self.path_prefix, URLENCODE_PATH_FRAGMENT); + write_encoded_fragment(&mut full_path, &path, URLENCODE_PATH_FRAGMENT); + + // Build the query string + if !query.as_ref().is_empty() { + full_path.push("?"); + for (i, (key, value)) in query.as_ref().iter().enumerate() { + if i != 0 { + full_path.push("&"); + } + write_encoded_fragment(&mut full_path, key, URLENCODE_QUERY_FRAGMENT); + full_path.push("="); + write_encoded_fragment(&mut full_path, value, URLENCODE_QUERY_FRAGMENT); + } + } + self.inner.set_request_path(full_path) } + /// Set the request path for this message. The path should not be URL-encoded; this method will + /// handle that. + fn set_request_path(&mut self, path: impl AsRef) -> Result<(), aws_crt_s3::common::error::Error> { + self.set_request_path_and_query::<&str>(path, &[]) + } + /// Sets the body input stream for this message, and returns any previously set input stream. /// If input_stream is None, unsets the body. fn set_body_stream(&mut self, input_stream: Option>) -> Option> { diff --git a/s3-client/src/s3_crt_client/head_object.rs b/s3-client/src/s3_crt_client/head_object.rs index c8d1e4a1c..299faf8d2 100644 --- a/s3-client/src/s3_crt_client/head_object.rs +++ b/s3-client/src/s3_crt_client/head_object.rs @@ -68,7 +68,6 @@ impl S3CrtClient { .new_request_template("HEAD", bucket) .map_err(S3RequestError::construction_failure)?; - // Don't URI encode the key, since "/" needs to be preserved let key = key.to_string(); message .set_request_path(format!("/{key}")) diff --git a/s3-client/src/s3_crt_client/list_objects.rs b/s3-client/src/s3_crt_client/list_objects.rs index 4d21bb964..9bfcf9eaf 100644 --- a/s3-client/src/s3_crt_client/list_objects.rs +++ b/s3-client/src/s3_crt_client/list_objects.rs @@ -143,17 +143,19 @@ impl S3CrtClient { .new_request_template("GET", bucket) .map_err(S3RequestError::construction_failure)?; - // Don't URI encode delimiter or prefix, since "/" in those needs to be a real "/". - let mut request = format!("/?list-type=2&delimiter={delimiter}&max-keys={max_keys}&prefix={prefix}"); - + let max_keys = format!("{max_keys}"); + let mut query = vec![ + ("list-type", "2"), + ("delimiter", delimiter), + ("max-keys", &max_keys), + ("prefix", prefix), + ]; if let Some(continuation_token) = continuation_token { - // DO URI encode the continuation token, since "/" in it needs to become "%2F" - let continuation_token = urlencoding::encode(continuation_token); - request = format!("{request}&continuation-token={continuation_token}"); + query.push(("continuation-token", continuation_token)); } message - .set_request_path(request) + .set_request_path_and_query("/", query) .map_err(S3RequestError::construction_failure)?; let span = request_span!(self, "list_objects"); diff --git a/s3-client/tests/list_objects.rs b/s3-client/tests/list_objects.rs index 3bb8119b6..f09490081 100644 --- a/s3-client/tests/list_objects.rs +++ b/s3-client/tests/list_objects.rs @@ -118,3 +118,49 @@ async fn test_list_objects_404_bucket() { Err(ObjectClientError::ServiceError(ListObjectsError::NoSuchBucket)) )); } + +// Test list with keys and arguments that poke at URL encoding +#[tokio::test] +async fn test_interesting_keys() { + let keys = &[ + "the first one@@@", + "the first one@@@/the 1st one!$%#@?_.-=&+^", + "the first one@@@/the 2nd+one!", + "the first one@@@/the 3rd one&", + ]; + let sdk_client = get_test_sdk_client().await; + let (bucket, prefix) = get_test_bucket_and_prefix("test_list_objects"); + create_objects_for_test(&sdk_client, &bucket, &prefix, keys).await; + + let client: S3CrtClient = get_test_client(); + + let result = client + .list_objects(&bucket, None, "/", 2, &prefix) + .await + .expect("ListObjects failed"); + assert_eq!(result.common_prefixes[0], format!("{prefix}{}/", keys[0])); + assert_eq!(result.objects[0].key, format!("{prefix}{}", keys[0])); + + let result = client + .list_objects(&bucket, None, "/", 1, &format!("{prefix}{}/", keys[0])) + .await + .expect("ListObjects failed"); + assert_eq!(result.objects.len(), 1); + assert_eq!(result.objects[0].key, format!("{prefix}{}", keys[1])); + assert!(result.next_continuation_token.is_some()); + + let result = client + .list_objects( + &bucket, + result.next_continuation_token.as_deref(), + "/", + 1000, + &format!("{prefix}{}/", keys[0]), + ) + .await + .expect("ListObjects failed"); + assert_eq!(result.objects[0].key, format!("{prefix}{}", keys[2])); + assert_eq!(result.objects[1].key, format!("{prefix}{}", keys[3])); + assert_eq!(result.objects.len(), 2); + assert!(result.next_continuation_token.is_none()); +} diff --git a/s3-file-connector/tests/fuse_tests/lookup_test.rs b/s3-file-connector/tests/fuse_tests/lookup_test.rs index 64898e3b1..b09efd737 100644 --- a/s3-file-connector/tests/fuse_tests/lookup_test.rs +++ b/s3-file-connector/tests/fuse_tests/lookup_test.rs @@ -1,4 +1,4 @@ -use std::fs::{metadata, read_dir}; +use std::fs::{metadata, read_dir, read_to_string}; use fuser::BackgroundSession; use s3_file_connector::S3FilesystemConfig; @@ -47,3 +47,64 @@ fn lookup_directory_overlap_test_s3(subdir: &str) { fn lookup_directory_overlap_test_mock(prefix: &str, subdir: &str) { lookup_directory_overlap_test(crate::fuse_tests::mock_session::new, prefix, subdir); } + +fn lookup_weird_characters_test(creator_fn: F, prefix: &str) +where + F: FnOnce(&str, S3FilesystemConfig) -> (TempDir, BackgroundSession, PutObjectFn), +{ + let (mount_point, _session, mut put_object_fn) = creator_fn(prefix, Default::default()); + + let keys = &[ + "weird$dir name", + "weird$dir name/my 1st file~.jpg", + "weird$dir name/my 2nd file: the better one.jpg", + "weirder_.-@dir +name", + "weirder_.-@dir +name/", + ]; + + for (i, key) in keys.iter().enumerate() { + put_object_fn(key, format!("hello world {i}").as_bytes()).unwrap(); + } + + let test_dir = read_dir(mount_point.path()).unwrap(); + let dirs: Vec<_> = test_dir.map(|f| f.unwrap()).collect(); + + assert_eq!( + dirs.iter() + .map(|f| f.path().file_name().unwrap().to_str().unwrap().to_owned()) + .collect::>(), + vec!["weird$dir name", "weirder_.-@dir +name"] + ); + + let m = metadata(mount_point.path().join(keys[0])).unwrap(); + assert!(m.file_type().is_dir()); + let m = metadata(mount_point.path().join(keys[3])).unwrap(); + assert!(m.file_type().is_dir()); + + let test_dir = read_dir(mount_point.path().join(keys[0])).unwrap(); + let dirs: Vec<_> = test_dir.map(|f| f.unwrap()).collect(); + + assert_eq!( + dirs.iter() + .map(|f| f.path().file_name().unwrap().to_str().unwrap().to_owned()) + .collect::>(), + vec!["my 1st file~.jpg", "my 2nd file: the better one.jpg"] + ); + + let f = read_to_string(mount_point.path().join(keys[1])).unwrap(); + assert_eq!(f, "hello world 1"); + + let f = read_to_string(mount_point.path().join(keys[2])).unwrap(); + assert_eq!(f, "hello world 2"); +} + +#[cfg(feature = "s3_tests")] +#[test] +fn lookup_directory_weird_characters_s3() { + lookup_weird_characters_test(crate::fuse_tests::s3_session::new, "lookup_weird_characters_test"); +} + +#[test] +fn lookup_directory_weird_characters_mock() { + lookup_weird_characters_test(crate::fuse_tests::mock_session::new, "lookup_weird_characters_test"); +}