From 2075df99cc3fba08a888332b519e8b03bfe469c0 Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Tue, 21 Nov 2023 15:57:22 +0100 Subject: [PATCH 1/6] PVF: Fix unshare "no such file or directory" error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In https://github.com/paritytech/polkadot-sdk/pull/2304 we introduced this bug where unshare will always fail, leading to a scary error message for validators. Not sure how this wasn't caught in testing, neither for me nor the user who reported #2304. 🙈 Closes #2320 --- polkadot/node/core/pvf/src/artifacts.rs | 5 +- .../node/core/pvf/src/execute/worker_intf.rs | 4 +- .../node/core/pvf/src/prepare/worker_intf.rs | 4 +- polkadot/node/core/pvf/src/security.rs | 17 ++- polkadot/node/core/pvf/src/worker_intf.rs | 105 ++++++++---------- 5 files changed, 60 insertions(+), 75 deletions(-) diff --git a/polkadot/node/core/pvf/src/artifacts.rs b/polkadot/node/core/pvf/src/artifacts.rs index 53085eade3cb..79b53467b4e3 100644 --- a/polkadot/node/core/pvf/src/artifacts.rs +++ b/polkadot/node/core/pvf/src/artifacts.rs @@ -499,8 +499,7 @@ mod tests { #[tokio::test] async fn remove_stale_cache_on_startup() { - let cache_dir = crate::worker_intf::tmppath("test-cache").await.unwrap(); - fs::create_dir_all(&cache_dir).unwrap(); + let cache_dir = tempfile::Builder::new().prefix("test-cache-").tempdir().unwrap(); // invalid prefix create_rand_artifact(&cache_dir, ""); @@ -529,7 +528,7 @@ mod tests { assert_eq!(fs::read_dir(&cache_dir).unwrap().count(), 7); - let artifacts = Artifacts::new_and_prune(&cache_dir).await; + let artifacts = Artifacts::new_and_prune(cache_dir.path()).await; assert_eq!(fs::read_dir(&cache_dir).unwrap().count(), 1); assert_eq!(artifacts.len(), 1); diff --git a/polkadot/node/core/pvf/src/execute/worker_intf.rs b/polkadot/node/core/pvf/src/execute/worker_intf.rs index 4faaf13e62f7..fc6cd8fc7256 100644 --- a/polkadot/node/core/pvf/src/execute/worker_intf.rs +++ b/polkadot/node/core/pvf/src/execute/worker_intf.rs @@ -278,7 +278,7 @@ where // Cheaply create a hard link to the artifact. The artifact is always at a known location in the // worker cache, and the child can't access any other artifacts or gain any information from the // original filename. - let link_path = worker_dir::execute_artifact(&worker_dir.path); + let link_path = worker_dir::execute_artifact(worker_dir.path()); if let Err(err) = tokio::fs::hard_link(artifact_path, link_path).await { gum::warn!( target: LOG_TARGET, @@ -292,7 +292,7 @@ where } } - let worker_dir_path = worker_dir.path.clone(); + let worker_dir_path = worker_dir.path().to_owned(); let outcome = f(worker_dir).await; // Try to clear the worker dir. diff --git a/polkadot/node/core/pvf/src/prepare/worker_intf.rs b/polkadot/node/core/pvf/src/prepare/worker_intf.rs index 318aea7295de..f978005068c8 100644 --- a/polkadot/node/core/pvf/src/prepare/worker_intf.rs +++ b/polkadot/node/core/pvf/src/prepare/worker_intf.rs @@ -318,7 +318,7 @@ where { // Create the tmp file here so that the child doesn't need any file creation rights. This will // be cleared at the end of this function. - let tmp_file = worker_dir::prepare_tmp_artifact(&worker_dir.path); + let tmp_file = worker_dir::prepare_tmp_artifact(worker_dir.path()); if let Err(err) = tokio::fs::File::create(&tmp_file).await { gum::warn!( target: LOG_TARGET, @@ -333,7 +333,7 @@ where } }; - let worker_dir_path = worker_dir.path.clone(); + let worker_dir_path = worker_dir.path().to_owned(); let outcome = f(tmp_file, stream, worker_dir).await; // Try to clear the worker dir. diff --git a/polkadot/node/core/pvf/src/security.rs b/polkadot/node/core/pvf/src/security.rs index 8c06c68392f4..8ae09158fe7a 100644 --- a/polkadot/node/core/pvf/src/security.rs +++ b/polkadot/node/core/pvf/src/security.rs @@ -158,18 +158,15 @@ async fn check_can_unshare_user_namespace_and_change_root( ) -> SecureModeResult { cfg_if::cfg_if! { if #[cfg(target_os = "linux")] { - let cache_dir_tempdir = - crate::worker_intf::tmppath_in("check-can-unshare", cache_path) - .await - .map_err( - |err| - SecureModeError::CannotUnshareUserNamespaceAndChangeRoot( - format!("could not create a temporary directory in {:?}: {}", cache_path, err) - ) - )?; + let cache_dir_tempdir = tempfile::Builder::new() + .prefix("check-can-unshare-") + .tempdir_in(cache_path) + .map_err(|err| SecureModeError::CannotUnshareUserNamespaceAndChangeRoot( + format!("could not create a temporary directory in {:?}: {}", cache_path, err) + ))?; match tokio::process::Command::new(prepare_worker_program_path) .arg("--check-can-unshare-user-namespace-and-change-root") - .arg(cache_dir_tempdir) + .arg(cache_dir_tempdir.path()) .output() .await { diff --git a/polkadot/node/core/pvf/src/worker_intf.rs b/polkadot/node/core/pvf/src/worker_intf.rs index 5e589b9abcee..9d6907c10929 100644 --- a/polkadot/node/core/pvf/src/worker_intf.rs +++ b/polkadot/node/core/pvf/src/worker_intf.rs @@ -71,6 +71,7 @@ pub async fn spawn_with_program_path( with_transient_socket_path(debug_id, |socket_path| { let socket_path = socket_path.to_owned(); + let worker_dir_path = worker_dir.path().to_owned(); async move { let listener = UnixListener::bind(&socket_path).map_err(|err| { @@ -91,7 +92,7 @@ pub async fn spawn_with_program_path( &program_path, &extra_args, &socket_path, - &worker_dir.path, + &worker_dir_path, security_status, ) .map_err(|err| { @@ -100,7 +101,7 @@ pub async fn spawn_with_program_path( %debug_id, ?program_path, ?extra_args, - ?worker_dir.path, + ?worker_dir_path, ?socket_path, "cannot spawn a worker: {:?}", err, @@ -108,7 +109,6 @@ pub async fn spawn_with_program_path( SpawnErr::ProcessSpawn })?; - let worker_dir_path = worker_dir.path.clone(); futures::select! { accept_result = listener.accept().fuse() => { let (stream, _) = accept_result.map_err(|err| { @@ -150,7 +150,42 @@ where F: FnOnce(&Path) -> Fut, Fut: futures::Future> + 'static, { - let socket_path = tmppath(&format!("pvf-host-{}", debug_id)) + /// Returns a path under [`std::env::temp_dir`]. The path name will start with the given prefix. + /// + /// There is only a certain number of retries. If exceeded this function will give up and return + /// an error. + pub async fn tmppath(prefix: &str) -> io::Result { + fn make_tmppath(prefix: &str, dir: &Path) -> PathBuf { + use rand::distributions::Alphanumeric; + + const DESCRIMINATOR_LEN: usize = 10; + + let mut buf = Vec::with_capacity(prefix.len() + DESCRIMINATOR_LEN); + buf.extend(prefix.as_bytes()); + buf.extend(rand::thread_rng().sample_iter(&Alphanumeric).take(DESCRIMINATOR_LEN)); + + let s = std::str::from_utf8(&buf) + .expect("the string is collected from a valid utf-8 sequence; qed"); + + let mut path = dir.to_owned(); + path.push(s); + path + } + + const NUM_RETRIES: usize = 50; + + let dir = std::env::temp_dir(); + for _ in 0..NUM_RETRIES { + let tmp_path = make_tmppath(prefix, &dir); + if !tmp_path.exists() { + return Ok(tmp_path) + } + } + + Err(io::Error::new(io::ErrorKind::Other, "failed to create a temporary path")) + } + + let socket_path = tmppath(&format!("pvf-host-{}-", debug_id)) .await .map_err(|_| SpawnErr::TmpPath)?; let result = f(&socket_path).await; @@ -162,46 +197,6 @@ where result } -/// Returns a path under the given `dir`. The path name will start with the given prefix. -/// -/// There is only a certain number of retries. If exceeded this function will give up and return an -/// error. -pub async fn tmppath_in(prefix: &str, dir: &Path) -> io::Result { - fn make_tmppath(prefix: &str, dir: &Path) -> PathBuf { - use rand::distributions::Alphanumeric; - - const DESCRIMINATOR_LEN: usize = 10; - - let mut buf = Vec::with_capacity(prefix.len() + DESCRIMINATOR_LEN); - buf.extend(prefix.as_bytes()); - buf.extend(rand::thread_rng().sample_iter(&Alphanumeric).take(DESCRIMINATOR_LEN)); - - let s = std::str::from_utf8(&buf) - .expect("the string is collected from a valid utf-8 sequence; qed"); - - let mut path = dir.to_owned(); - path.push(s); - path - } - - const NUM_RETRIES: usize = 50; - - for _ in 0..NUM_RETRIES { - let tmp_path = make_tmppath(prefix, dir); - if !tmp_path.exists() { - return Ok(tmp_path) - } - } - - Err(io::Error::new(io::ErrorKind::Other, "failed to create a temporary path")) -} - -/// The same as [`tmppath_in`], but uses [`std::env::temp_dir`] as the directory. -pub async fn tmppath(prefix: &str) -> io::Result { - let temp_dir = std::env::temp_dir(); - tmppath_in(prefix, &temp_dir).await -} - /// A struct that represents an idle worker. /// /// This struct is supposed to be used as a token that is passed by move into a subroutine that @@ -224,8 +219,6 @@ pub struct IdleWorker { pub enum SpawnErr { /// Cannot obtain a temporary path location. TmpPath, - /// An FS error occurred. - Fs(String), /// Cannot bind the socket to the given path. Bind, /// An error happened during accepting a connection to the socket. @@ -419,26 +412,22 @@ pub async fn framed_recv(r: &mut (impl AsyncRead + Unpin)) -> io::Result /// ``` #[derive(Debug)] pub struct WorkerDir { - pub path: PathBuf, + tempdir: tempfile::TempDir, } impl WorkerDir { /// Creates a new, empty worker dir with a random name in the given cache dir. pub async fn new(debug_id: &'static str, cache_dir: &Path) -> Result { let prefix = format!("worker-dir-{}-", debug_id); - let path = tmppath_in(&prefix, cache_dir).await.map_err(|_| SpawnErr::TmpPath)?; - tokio::fs::create_dir(&path) - .await - .map_err(|err| SpawnErr::Fs(err.to_string()))?; - Ok(Self { path }) + let tempdir = tempfile::Builder::new() + .prefix(&prefix) + .tempdir_in(cache_dir) + .map_err(|_| SpawnErr::TmpPath)?; + Ok(Self { tempdir }) } -} -// Try to clean up the temporary worker dir at the end of the worker's lifetime. It should be wiped -// on startup, but we make a best effort not to leave it around. -impl Drop for WorkerDir { - fn drop(&mut self) { - let _ = std::fs::remove_dir_all(&self.path); + pub fn path(&self) -> &Path { + self.tempdir.path() } } From 688f44dd72c08f21de68f480e2ece3d14716a380 Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Tue, 21 Nov 2023 16:42:20 +0100 Subject: [PATCH 2/6] Add test for security feature detection --- polkadot/node/core/pvf/src/host.rs | 9 ++++++--- polkadot/node/core/pvf/src/security.rs | 2 +- polkadot/node/core/pvf/tests/it/main.rs | 17 +++++++++++++++++ 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/polkadot/node/core/pvf/src/host.rs b/polkadot/node/core/pvf/src/host.rs index 6049f51834e3..be8f7aee7784 100644 --- a/polkadot/node/core/pvf/src/host.rs +++ b/polkadot/node/core/pvf/src/host.rs @@ -24,7 +24,7 @@ use crate::{ artifacts::{ArtifactId, ArtifactPathId, ArtifactState, Artifacts}, execute::{self, PendingExecutionRequest}, metrics::Metrics, - prepare, security, Priority, ValidationError, LOG_TARGET, + prepare, security, Priority, SecurityStatus, ValidationError, LOG_TARGET, }; use always_assert::never; use futures::{ @@ -70,6 +70,8 @@ pub(crate) type PrecheckResultSender = oneshot::Sender; #[derive(Clone)] pub struct ValidationHost { to_host_tx: mpsc::Sender, + /// Available security features, detected by the host during startup. + pub security_status: SecurityStatus, } impl ValidationHost { @@ -216,7 +218,7 @@ pub async fn start( let (to_host_tx, to_host_rx) = mpsc::channel(10); - let validation_host = ValidationHost { to_host_tx }; + let validation_host = ValidationHost { to_host_tx, security_status: security_status.clone() }; let (to_prepare_pool, from_prepare_pool, run_prepare_pool) = prepare::start_pool( metrics.clone(), @@ -978,7 +980,8 @@ pub(crate) mod tests { fn host_handle(&mut self) -> ValidationHost { let to_host_tx = self.to_host_tx.take().unwrap(); - ValidationHost { to_host_tx } + let security_status = Default::default(); + ValidationHost { to_host_tx, security_status } } async fn poll_and_recv_result(&mut self, result_rx: oneshot::Receiver) -> T diff --git a/polkadot/node/core/pvf/src/security.rs b/polkadot/node/core/pvf/src/security.rs index 8ae09158fe7a..ef8714f58c81 100644 --- a/polkadot/node/core/pvf/src/security.rs +++ b/polkadot/node/core/pvf/src/security.rs @@ -28,7 +28,7 @@ const SECURE_MODE_ANNOUNCEMENT: &'static str = /// Run checks for supported security features. /// -/// # Return +/// # Returns /// /// Returns the set of security features that we were able to enable. If an error occurs while /// enabling a security feature we set the corresponding status to `false`. diff --git a/polkadot/node/core/pvf/tests/it/main.rs b/polkadot/node/core/pvf/tests/it/main.rs index 4c81ac502dd4..f647b392faa1 100644 --- a/polkadot/node/core/pvf/tests/it/main.rs +++ b/polkadot/node/core/pvf/tests/it/main.rs @@ -402,3 +402,20 @@ async fn prepare_can_run_serially() { // Prepare a different wasm blob to prevent skipping work. let _stats = host.precheck_pvf(halt::wasm_binary_unwrap(), Default::default()).await.unwrap(); } + +// CI machines should be able to enable all the security features. +#[cfg(all(feature = "ci-only-tests", target_os = "linux"))] +#[tokio::test] +fn all_security_features_work() { + use polkadot_node_core_pvf::SecurityStatus; + + let host = TestHost::new().await; + assert_eq!( + host.security_status, + SecurityStatus { + can_enable_landlock: true, + can_enable_seccomp: true, + can_unshare_user_namespace_and_change_root: true, + } + ); +} From 7028485e707fa203c5e126f4ab71c83e4e6b36fc Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Tue, 21 Nov 2023 16:53:45 +0100 Subject: [PATCH 3/6] Fix test; fix CI-only tests not running --- .gitlab/pipeline/test.yml | 2 +- polkadot/node/core/pvf/common/src/lib.rs | 2 +- polkadot/node/core/pvf/tests/it/main.rs | 13 +++++++++---- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/.gitlab/pipeline/test.yml b/.gitlab/pipeline/test.yml index 4ed3ec19c48a..79ef9070dba9 100644 --- a/.gitlab/pipeline/test.yml +++ b/.gitlab/pipeline/test.yml @@ -29,7 +29,7 @@ test-linux-stable: --locked \ --release \ --no-fail-fast \ - --features try-runtime,experimental \ + --features try-runtime,experimental,ci-only-tests \ --partition count:${CI_NODE_INDEX}/${CI_NODE_TOTAL} # Upload tests results to Elasticsearch - echo "Upload test results to Elasticsearch" diff --git a/polkadot/node/core/pvf/common/src/lib.rs b/polkadot/node/core/pvf/common/src/lib.rs index 278fa3fe821c..c041c60aaf9c 100644 --- a/polkadot/node/core/pvf/common/src/lib.rs +++ b/polkadot/node/core/pvf/common/src/lib.rs @@ -47,7 +47,7 @@ pub mod tests { } /// Status of security features on the current system. -#[derive(Debug, Clone, Default)] +#[derive(Debug, Clone, Default, PartialEq, Eq)] pub struct SecurityStatus { /// Whether the landlock features we use are fully available on this system. pub can_enable_landlock: bool, diff --git a/polkadot/node/core/pvf/tests/it/main.rs b/polkadot/node/core/pvf/tests/it/main.rs index f647b392faa1..7770d37825e4 100644 --- a/polkadot/node/core/pvf/tests/it/main.rs +++ b/polkadot/node/core/pvf/tests/it/main.rs @@ -18,6 +18,8 @@ use assert_matches::assert_matches; use parity_scale_codec::Encode as _; +#[cfg(all(feature = "ci-only-tests", target_os = "linux"))] +use polkadot_node_core_pvf::SecurityStatus; use polkadot_node_core_pvf::{ start, testing::build_workers_and_get_paths, Config, InvalidCandidate, Metrics, PrepareError, PrepareJobKind, PvfPrepData, ValidationError, ValidationHost, JOB_TIMEOUT_WALL_CLOCK_FACTOR, @@ -122,6 +124,11 @@ impl TestHost { .unwrap(); result_rx.await.unwrap() } + + #[cfg(all(feature = "ci-only-tests", target_os = "linux"))] + async fn security_status(&self) -> SecurityStatus { + self.host.lock().await.security_status.clone() + } } #[tokio::test] @@ -406,12 +413,10 @@ async fn prepare_can_run_serially() { // CI machines should be able to enable all the security features. #[cfg(all(feature = "ci-only-tests", target_os = "linux"))] #[tokio::test] -fn all_security_features_work() { - use polkadot_node_core_pvf::SecurityStatus; - +async fn all_security_features_work() { let host = TestHost::new().await; assert_eq!( - host.security_status, + host.security_status().await, SecurityStatus { can_enable_landlock: true, can_enable_seccomp: true, From 96083527871b79023ac0f86c069349df02bcbf51 Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Wed, 22 Nov 2023 10:56:16 +0100 Subject: [PATCH 4/6] Random doc-only fix --- polkadot/zombienet_tests/functional/0001-parachains-pvf.zndsl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/polkadot/zombienet_tests/functional/0001-parachains-pvf.zndsl b/polkadot/zombienet_tests/functional/0001-parachains-pvf.zndsl index 46bb8bcdf72b..135999a092a7 100644 --- a/polkadot/zombienet_tests/functional/0001-parachains-pvf.zndsl +++ b/polkadot/zombienet_tests/functional/0001-parachains-pvf.zndsl @@ -54,8 +54,8 @@ one: reports histogram polkadot_pvf_preparation_time has 0 samples in buckets [" two: reports histogram polkadot_pvf_preparation_time has 0 samples in buckets ["20", "30", "60", "120", "+Inf"] within 10 seconds # Check execution time. -# There are two different timeout conditions: BACKING_EXECUTION_TIMEOUT(2s) and -# APPROVAL_EXECUTION_TIMEOUT(6s). Currently these are not differentiated by metrics +# There are two different timeout conditions: DEFAULT_BACKING_EXECUTION_TIMEOUT(2s) and +# DEFAULT_APPROVAL_EXECUTION_TIMEOUT(12s). Currently these are not differentiated by metrics # because the metrics are defined in `polkadot-node-core-pvf` which is a level below # the relevant subsystems. # That being said, we will take the simplifying assumption of testing only the From 6e2c6ca6f8ccc3e6b170e71d17ab0daa790ccac8 Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Wed, 22 Nov 2023 11:18:31 +0100 Subject: [PATCH 5/6] Fix test failing on old kernel CI job --- polkadot/node/core/pvf/Cargo.toml | 1 + polkadot/node/core/pvf/tests/it/main.rs | 12 +++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/polkadot/node/core/pvf/Cargo.toml b/polkadot/node/core/pvf/Cargo.toml index 27da484fe4f1..05509a5f8536 100644 --- a/polkadot/node/core/pvf/Cargo.toml +++ b/polkadot/node/core/pvf/Cargo.toml @@ -54,6 +54,7 @@ halt = { package = "test-parachain-halt", path = "../../../parachain/test-parach [target.'cfg(target_os = "linux")'.dev-dependencies] procfs = "0.16.0" rusty-fork = "0.3.0" +sc-sysinfo = { path = "../../../../substrate/client/sysinfo" } [[bench]] name = "host_prepare_rococo_runtime" diff --git a/polkadot/node/core/pvf/tests/it/main.rs b/polkadot/node/core/pvf/tests/it/main.rs index 7770d37825e4..b53d598cd0f7 100644 --- a/polkadot/node/core/pvf/tests/it/main.rs +++ b/polkadot/node/core/pvf/tests/it/main.rs @@ -414,11 +414,21 @@ async fn prepare_can_run_serially() { #[cfg(all(feature = "ci-only-tests", target_os = "linux"))] #[tokio::test] async fn all_security_features_work() { + // Landlock is only available starting Linux 5.13, and we may be testing on an old kernel. + let sysinfo = sc_sysinfo::gather_sysinfo(); + // The version will look something like "5.15.0-87-generic". + let version = sysinfo.linux_kernel.unwrap(); + let version_split: Vec<&str> = version.split(".").collect(); + let major: u32 = version_split[0].parse().unwrap(); + let minor: u32 = version_split[1].parse().unwrap(); + let can_enable_landlock = if major >= 6 { true } else { minor >= 13 }; + let host = TestHost::new().await; + assert_eq!( host.security_status().await, SecurityStatus { - can_enable_landlock: true, + can_enable_landlock, can_enable_seccomp: true, can_unshare_user_namespace_and_change_root: true, } From a9c23ba2b1679c2d7ed185e785cd1fb82098d138 Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Wed, 22 Nov 2023 11:40:13 +0100 Subject: [PATCH 6/6] update Cargo.lock --- Cargo.lock | 1 + 1 file changed, 1 insertion(+) diff --git a/Cargo.lock b/Cargo.lock index 697f232f9719..e90d1d1bc625 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -12591,6 +12591,7 @@ dependencies = [ "rand 0.8.5", "rococo-runtime", "rusty-fork", + "sc-sysinfo", "slotmap", "sp-core", "sp-maybe-compressed-blob",