Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PVF: Fix unshare "no such file or directory" error #2426

Merged
merged 6 commits into from
Nov 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitlab/pipeline/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ test-linux-stable:
--locked \
--release \
--no-fail-fast \
--features try-runtime,experimental \
--features try-runtime,experimental,ci-only-tests \
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pretty sure this used to be here!

Copy link
Contributor Author

@mrcnski mrcnski Nov 22, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@paritytech/ci How can we remove this feature for test-linux-oldkernel-stable? It's expected that this test fails on old kernels.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Considering that the feature is called ci-only-tests and not something like ci-only-tests-new-kernel, maybe we could instead check the kernel version in the test (using uname) and skip the test if it's too old.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great idea @alindima, done.

--partition count:${CI_NODE_INDEX}/${CI_NODE_TOTAL}
# Upload tests results to Elasticsearch
- echo "Upload test results to Elasticsearch"
Expand Down
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions polkadot/node/core/pvf/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ halt = { package = "test-parachain-halt", path = "../../../parachain/test-parach
[target.'cfg(target_os = "linux")'.dev-dependencies]
procfs = "0.16.0"
rusty-fork = "0.3.0"
sc-sysinfo = { path = "../../../../substrate/client/sysinfo" }

[[bench]]
name = "host_prepare_rococo_runtime"
Expand Down
2 changes: 1 addition & 1 deletion polkadot/node/core/pvf/common/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ pub mod tests {
}

/// Status of security features on the current system.
#[derive(Debug, Clone, Default)]
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct SecurityStatus {
/// Whether the landlock features we use are fully available on this system.
pub can_enable_landlock: bool,
Expand Down
5 changes: 2 additions & 3 deletions polkadot/node/core/pvf/src/artifacts.rs
Original file line number Diff line number Diff line change
Expand Up @@ -499,8 +499,7 @@ mod tests {

#[tokio::test]
async fn remove_stale_cache_on_startup() {
let cache_dir = crate::worker_intf::tmppath("test-cache").await.unwrap();
fs::create_dir_all(&cache_dir).unwrap();
let cache_dir = tempfile::Builder::new().prefix("test-cache-").tempdir().unwrap();

// invalid prefix
create_rand_artifact(&cache_dir, "");
Expand Down Expand Up @@ -529,7 +528,7 @@ mod tests {

assert_eq!(fs::read_dir(&cache_dir).unwrap().count(), 7);

let artifacts = Artifacts::new_and_prune(&cache_dir).await;
let artifacts = Artifacts::new_and_prune(cache_dir.path()).await;

assert_eq!(fs::read_dir(&cache_dir).unwrap().count(), 1);
assert_eq!(artifacts.len(), 1);
Expand Down
4 changes: 2 additions & 2 deletions polkadot/node/core/pvf/src/execute/worker_intf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ where
// Cheaply create a hard link to the artifact. The artifact is always at a known location in the
// worker cache, and the child can't access any other artifacts or gain any information from the
// original filename.
let link_path = worker_dir::execute_artifact(&worker_dir.path);
let link_path = worker_dir::execute_artifact(worker_dir.path());
if let Err(err) = tokio::fs::hard_link(artifact_path, link_path).await {
gum::warn!(
target: LOG_TARGET,
Expand All @@ -292,7 +292,7 @@ where
}
}

let worker_dir_path = worker_dir.path.clone();
let worker_dir_path = worker_dir.path().to_owned();
let outcome = f(worker_dir).await;

// Try to clear the worker dir.
Expand Down
9 changes: 6 additions & 3 deletions polkadot/node/core/pvf/src/host.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ use crate::{
artifacts::{ArtifactId, ArtifactPathId, ArtifactState, Artifacts},
execute::{self, PendingExecutionRequest},
metrics::Metrics,
prepare, security, Priority, ValidationError, LOG_TARGET,
prepare, security, Priority, SecurityStatus, ValidationError, LOG_TARGET,
};
use always_assert::never;
use futures::{
Expand Down Expand Up @@ -70,6 +70,8 @@ pub(crate) type PrecheckResultSender = oneshot::Sender<PrecheckResult>;
#[derive(Clone)]
pub struct ValidationHost {
to_host_tx: mpsc::Sender<ToHost>,
/// Available security features, detected by the host during startup.
pub security_status: SecurityStatus,
}

impl ValidationHost {
Expand Down Expand Up @@ -216,7 +218,7 @@ pub async fn start(

let (to_host_tx, to_host_rx) = mpsc::channel(10);

let validation_host = ValidationHost { to_host_tx };
let validation_host = ValidationHost { to_host_tx, security_status: security_status.clone() };

let (to_prepare_pool, from_prepare_pool, run_prepare_pool) = prepare::start_pool(
metrics.clone(),
Expand Down Expand Up @@ -978,7 +980,8 @@ pub(crate) mod tests {

fn host_handle(&mut self) -> ValidationHost {
let to_host_tx = self.to_host_tx.take().unwrap();
ValidationHost { to_host_tx }
let security_status = Default::default();
ValidationHost { to_host_tx, security_status }
}

async fn poll_and_recv_result<T>(&mut self, result_rx: oneshot::Receiver<T>) -> T
Expand Down
4 changes: 2 additions & 2 deletions polkadot/node/core/pvf/src/prepare/worker_intf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -318,7 +318,7 @@ where
{
// Create the tmp file here so that the child doesn't need any file creation rights. This will
// be cleared at the end of this function.
let tmp_file = worker_dir::prepare_tmp_artifact(&worker_dir.path);
let tmp_file = worker_dir::prepare_tmp_artifact(worker_dir.path());
if let Err(err) = tokio::fs::File::create(&tmp_file).await {
gum::warn!(
target: LOG_TARGET,
Expand All @@ -333,7 +333,7 @@ where
}
};

let worker_dir_path = worker_dir.path.clone();
let worker_dir_path = worker_dir.path().to_owned();
let outcome = f(tmp_file, stream, worker_dir).await;

// Try to clear the worker dir.
Expand Down
19 changes: 8 additions & 11 deletions polkadot/node/core/pvf/src/security.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ const SECURE_MODE_ANNOUNCEMENT: &'static str =

/// Run checks for supported security features.
///
/// # Return
/// # Returns
///
/// Returns the set of security features that we were able to enable. If an error occurs while
/// enabling a security feature we set the corresponding status to `false`.
Expand Down Expand Up @@ -158,18 +158,15 @@ async fn check_can_unshare_user_namespace_and_change_root(
) -> SecureModeResult {
cfg_if::cfg_if! {
if #[cfg(target_os = "linux")] {
let cache_dir_tempdir =
crate::worker_intf::tmppath_in("check-can-unshare", cache_path)
.await
.map_err(
|err|
SecureModeError::CannotUnshareUserNamespaceAndChangeRoot(
format!("could not create a temporary directory in {:?}: {}", cache_path, err)
)
)?;
let cache_dir_tempdir = tempfile::Builder::new()
.prefix("check-can-unshare-")
.tempdir_in(cache_path)
.map_err(|err| SecureModeError::CannotUnshareUserNamespaceAndChangeRoot(
format!("could not create a temporary directory in {:?}: {}", cache_path, err)
))?;
match tokio::process::Command::new(prepare_worker_program_path)
.arg("--check-can-unshare-user-namespace-and-change-root")
.arg(cache_dir_tempdir)
.arg(cache_dir_tempdir.path())
.output()
.await
{
Expand Down
105 changes: 47 additions & 58 deletions polkadot/node/core/pvf/src/worker_intf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ pub async fn spawn_with_program_path(

with_transient_socket_path(debug_id, |socket_path| {
let socket_path = socket_path.to_owned();
let worker_dir_path = worker_dir.path().to_owned();

async move {
let listener = UnixListener::bind(&socket_path).map_err(|err| {
Expand All @@ -91,7 +92,7 @@ pub async fn spawn_with_program_path(
&program_path,
&extra_args,
&socket_path,
&worker_dir.path,
&worker_dir_path,
security_status,
)
.map_err(|err| {
Expand All @@ -100,15 +101,14 @@ pub async fn spawn_with_program_path(
%debug_id,
?program_path,
?extra_args,
?worker_dir.path,
?worker_dir_path,
?socket_path,
"cannot spawn a worker: {:?}",
err,
);
SpawnErr::ProcessSpawn
})?;

let worker_dir_path = worker_dir.path.clone();
futures::select! {
accept_result = listener.accept().fuse() => {
let (stream, _) = accept_result.map_err(|err| {
Expand Down Expand Up @@ -150,7 +150,42 @@ where
F: FnOnce(&Path) -> Fut,
Fut: futures::Future<Output = Result<T, SpawnErr>> + 'static,
{
let socket_path = tmppath(&format!("pvf-host-{}", debug_id))
/// Returns a path under [`std::env::temp_dir`]. The path name will start with the given prefix.
///
/// There is only a certain number of retries. If exceeded this function will give up and return
/// an error.
pub async fn tmppath(prefix: &str) -> io::Result<PathBuf> {
mrcnski marked this conversation as resolved.
Show resolved Hide resolved
fn make_tmppath(prefix: &str, dir: &Path) -> PathBuf {
use rand::distributions::Alphanumeric;

const DESCRIMINATOR_LEN: usize = 10;

let mut buf = Vec::with_capacity(prefix.len() + DESCRIMINATOR_LEN);
buf.extend(prefix.as_bytes());
buf.extend(rand::thread_rng().sample_iter(&Alphanumeric).take(DESCRIMINATOR_LEN));

let s = std::str::from_utf8(&buf)
.expect("the string is collected from a valid utf-8 sequence; qed");

let mut path = dir.to_owned();
path.push(s);
path
}

const NUM_RETRIES: usize = 50;

let dir = std::env::temp_dir();
for _ in 0..NUM_RETRIES {
let tmp_path = make_tmppath(prefix, &dir);
if !tmp_path.exists() {
return Ok(tmp_path)
}
}

Err(io::Error::new(io::ErrorKind::Other, "failed to create a temporary path"))
}

let socket_path = tmppath(&format!("pvf-host-{}-", debug_id))
.await
.map_err(|_| SpawnErr::TmpPath)?;
let result = f(&socket_path).await;
Expand All @@ -162,46 +197,6 @@ where
result
}

/// Looks for an unused path inside `dir` whose file name starts with `prefix`.
///
/// Each attempt appends a fresh random alphanumeric suffix to the prefix. The number of attempts
/// is bounded; once they are exhausted without finding a free path an error is returned.
pub async fn tmppath_in(prefix: &str, dir: &Path) -> io::Result<PathBuf> {
    /// Joins `dir` with `prefix` followed by a random alphanumeric suffix.
    fn make_tmppath(prefix: &str, dir: &Path) -> PathBuf {
        use rand::distributions::Alphanumeric;

        const SUFFIX_LEN: usize = 10;

        let suffix = rand::thread_rng().sample_iter(&Alphanumeric).take(SUFFIX_LEN);
        let bytes: Vec<u8> = prefix.bytes().chain(suffix).collect();

        let name = String::from_utf8(bytes)
            .expect("the string is collected from a valid utf-8 sequence; qed");

        dir.join(name)
    }

    const NUM_RETRIES: usize = 50;

    let free_path = (0..NUM_RETRIES).find_map(|_| {
        let candidate = make_tmppath(prefix, dir);
        (!candidate.exists()).then_some(candidate)
    });

    match free_path {
        Some(path) => Ok(path),
        None => Err(io::Error::new(io::ErrorKind::Other, "failed to create a temporary path")),
    }
}

/// Convenience wrapper around [`tmppath_in`] that searches under [`std::env::temp_dir`].
pub async fn tmppath(prefix: &str) -> io::Result<PathBuf> {
    tmppath_in(prefix, std::env::temp_dir().as_path()).await
}

/// A struct that represents an idle worker.
///
/// This struct is supposed to be used as a token that is passed by move into a subroutine that
Expand All @@ -224,8 +219,6 @@ pub struct IdleWorker {
pub enum SpawnErr {
/// Cannot obtain a temporary path location.
TmpPath,
/// An FS error occurred.
Fs(String),
/// Cannot bind the socket to the given path.
Bind,
/// An error happened during accepting a connection to the socket.
Expand Down Expand Up @@ -419,26 +412,22 @@ pub async fn framed_recv(r: &mut (impl AsyncRead + Unpin)) -> io::Result<Vec<u8>
/// ```
#[derive(Debug)]
pub struct WorkerDir {
pub path: PathBuf,
tempdir: tempfile::TempDir,
}

impl WorkerDir {
/// Creates a new, empty worker dir with a random name in the given cache dir.
pub async fn new(debug_id: &'static str, cache_dir: &Path) -> Result<Self, SpawnErr> {
let prefix = format!("worker-dir-{}-", debug_id);
let path = tmppath_in(&prefix, cache_dir).await.map_err(|_| SpawnErr::TmpPath)?;
tokio::fs::create_dir(&path)
.await
.map_err(|err| SpawnErr::Fs(err.to_string()))?;
Ok(Self { path })
let tempdir = tempfile::Builder::new()
.prefix(&prefix)
.tempdir_in(cache_dir)
.map_err(|_| SpawnErr::TmpPath)?;
Ok(Self { tempdir })
}
}

// Try to clean up the temporary worker dir at the end of the worker's lifetime. It should be wiped
// on startup, but we make a best effort not to leave it around.
impl Drop for WorkerDir {
fn drop(&mut self) {
let _ = std::fs::remove_dir_all(&self.path);
pub fn path(&self) -> &Path {
self.tempdir.path()
}
}

Expand Down
32 changes: 32 additions & 0 deletions polkadot/node/core/pvf/tests/it/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@

use assert_matches::assert_matches;
use parity_scale_codec::Encode as _;
#[cfg(all(feature = "ci-only-tests", target_os = "linux"))]
use polkadot_node_core_pvf::SecurityStatus;
use polkadot_node_core_pvf::{
start, testing::build_workers_and_get_paths, Config, InvalidCandidate, Metrics, PrepareError,
PrepareJobKind, PvfPrepData, ValidationError, ValidationHost, JOB_TIMEOUT_WALL_CLOCK_FACTOR,
Expand Down Expand Up @@ -122,6 +124,11 @@ impl TestHost {
.unwrap();
result_rx.await.unwrap()
}

/// Returns a copy of the security status stored on the wrapped validation host.
#[cfg(all(feature = "ci-only-tests", target_os = "linux"))]
async fn security_status(&self) -> SecurityStatus {
    let host = self.host.lock().await;
    host.security_status.clone()
}
}

#[tokio::test]
Expand Down Expand Up @@ -402,3 +409,28 @@ async fn prepare_can_run_serially() {
// Prepare a different wasm blob to prevent skipping work.
let _stats = host.precheck_pvf(halt::wasm_binary_unwrap(), Default::default()).await.unwrap();
}

// CI machines should be able to enable all the security features.
//
// Landlock is the one exception: it only exists on Linux 5.13 and newer, so on older kernels the
// test expects it to be reported as unavailable instead of failing.
#[cfg(all(feature = "ci-only-tests", target_os = "linux"))]
#[tokio::test]
async fn all_security_features_work() {
    // Landlock is only available starting Linux 5.13, and we may be testing on an old kernel.
    let sysinfo = sc_sysinfo::gather_sysinfo();
    // The version will look something like "5.15.0-87-generic".
    let version = sysinfo.linux_kernel.expect("the kernel version should be available");
    let mut components = version.split('.');
    let major: u32 = components
        .next()
        .and_then(|part| part.parse().ok())
        .expect("the kernel version should start with a numeric major component");
    let minor: u32 = components
        .next()
        .and_then(|part| part.parse().ok())
        .expect("the kernel version should have a numeric minor component");
    // Compare `(major, minor)` as a pair (tuples order lexicographically). Looking only at
    // `minor >= 13` for every pre-6.x kernel would wrongly treat e.g. 4.19 as supporting Landlock.
    let can_enable_landlock = (major, minor) >= (5, 13);

    let host = TestHost::new().await;

    assert_eq!(
        host.security_status().await,
        SecurityStatus {
            can_enable_landlock,
            can_enable_seccomp: true,
            can_unshare_user_namespace_and_change_root: true,
        }
    );
}
4 changes: 2 additions & 2 deletions polkadot/zombienet_tests/functional/0001-parachains-pvf.zndsl
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,8 @@ one: reports histogram polkadot_pvf_preparation_time has 0 samples in buckets ["
two: reports histogram polkadot_pvf_preparation_time has 0 samples in buckets ["20", "30", "60", "120", "+Inf"] within 10 seconds

# Check execution time.
# There are two different timeout conditions: BACKING_EXECUTION_TIMEOUT(2s) and
# APPROVAL_EXECUTION_TIMEOUT(6s). Currently these are not differentiated by metrics
# There are two different timeout conditions: DEFAULT_BACKING_EXECUTION_TIMEOUT(2s) and
# DEFAULT_APPROVAL_EXECUTION_TIMEOUT(12s). Currently these are not differentiated by metrics
# because the metrics are defined in `polkadot-node-core-pvf` which is a level below
# the relevant subsystems.
# That being said, we will take the simplifying assumption of testing only the
Expand Down