diff --git a/Cargo.lock b/Cargo.lock
index 20e9ac841d..f037b78092 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1942,6 +1942,7 @@ dependencies = [
  "libc",
  "libcgroups",
  "libseccomp",
+ "nc",
  "nix 0.27.1",
  "oci-spec",
  "once_cell",
@@ -2217,6 +2218,15 @@ dependencies = [
  "tempfile",
 ]
 
+[[package]]
+name = "nc"
+version = "0.8.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "83c88ca23498aaa86177921d95ade67290248f9ef71f7416dc47d07cdc3c72a1"
+dependencies = [
+ "cc",
+]
+
 [[package]]
 name = "nix"
 version = "0.26.2"
diff --git a/crates/libcontainer/Cargo.toml b/crates/libcontainer/Cargo.toml
index abc4b917cc..553f9de346 100644
--- a/crates/libcontainer/Cargo.toml
+++ b/crates/libcontainer/Cargo.toml
@@ -53,6 +53,7 @@ regex = "1.10.2"
 thiserror = "1.0.50"
 tracing = { version = "0.1.40", features = ["attributes"] }
 safe-path = "0.1.0"
+nc = "0.8.18"
 
 [dev-dependencies]
 oci-spec = { version = "~0.6.4", features = ["proptests", "runtime"] }
diff --git a/crates/libcontainer/src/container/tenant_builder.rs b/crates/libcontainer/src/container/tenant_builder.rs
index 4adcc0e76f..954f9cee34 100644
--- a/crates/libcontainer/src/container/tenant_builder.rs
+++ b/crates/libcontainer/src/container/tenant_builder.rs
@@ -4,7 +4,7 @@ use nix::unistd::{self, close, pipe2, read, Pid};
 use oci_spec::runtime::{
     Capabilities as SpecCapabilities, Capability as SpecCapability, LinuxBuilder,
     LinuxCapabilities, LinuxCapabilitiesBuilder, LinuxNamespace, LinuxNamespaceBuilder,
-    LinuxNamespaceType, Process, ProcessBuilder, Spec,
+    LinuxNamespaceType, LinuxSchedulerPolicy, Process, ProcessBuilder, Spec,
 };
 use procfs::process::Namespace;
 
@@ -222,6 +222,64 @@ impl TenantContainerBuilder {
                     }
                 }
             }
+
+            if let Some(sc) = process.scheduler() {
+                let policy = sc.policy();
+                if let Some(nice) = sc.nice() {
+                    // nice is only interpreted for SchedOther and SchedBatch, where it must lie in
+                    // -20..=19, see https://man7.org/linux/man-pages/man2/sched_setattr.2.html
+                    if (*policy == LinuxSchedulerPolicy::SchedBatch
+                        || *policy == LinuxSchedulerPolicy::SchedOther)
+                        && (*nice < -20 || *nice > 19)
+                    {
+                        tracing::error!(?nice, "invalid scheduler.nice: '{}'", nice);
+                        Err(ErrInvalidSpec::Scheduler)?;
+                    }
+                }
+                if let Some(priority) = sc.priority() {
+                    // A non-zero priority is rejected for every policy except SchedFifo/SchedRr.
+                    if *priority != 0
+                        && (*policy != LinuxSchedulerPolicy::SchedFifo
+                            && *policy != LinuxSchedulerPolicy::SchedRr)
+                    {
+                        tracing::error!(
+                            ?policy,
+                            "scheduler.priority can only be specified for SchedFIFO or SchedRR policy"
+                        );
+                        Err(ErrInvalidSpec::Scheduler)?;
+                    }
+                }
+                // Non-zero runtime, deadline and period are reserved for the SchedDeadline policy.
+                if *policy != LinuxSchedulerPolicy::SchedDeadline {
+                    if let Some(runtime) = sc.runtime() {
+                        if *runtime != 0 {
+                            tracing::error!(
+                                ?runtime,
+                                "scheduler runtime can only be specified for SchedDeadline policy"
+                            );
+                            Err(ErrInvalidSpec::Scheduler)?;
+                        }
+                    }
+                    if let Some(deadline) = sc.deadline() {
+                        if *deadline != 0 {
+                            tracing::error!(
+                                ?deadline,
+                                "scheduler deadline can only be specified for SchedDeadline policy"
+                            );
+                            Err(ErrInvalidSpec::Scheduler)?;
+                        }
+                    }
+                    if let Some(period) = sc.period() {
+                        if *period != 0 {
+                            tracing::error!(
+                                ?period,
+                                "scheduler period can only be specified for SchedDeadline policy"
+                            );
+                            Err(ErrInvalidSpec::Scheduler)?;
+                        }
+                    }
+                }
+            }
         }
 
         utils::validate_spec_for_new_user_ns(spec)?;
diff --git a/crates/libcontainer/src/error.rs b/crates/libcontainer/src/error.rs
index cda150addd..f0555c9626 100644
--- a/crates/libcontainer/src/error.rs
+++ b/crates/libcontainer/src/error.rs
@@ -92,4 +92,6 @@ pub enum ErrInvalidSpec {
     AppArmorNotEnabled,
     #[error("invalid io priority or class.")]
     IoPriority,
+    #[error("invalid scheduler config for process")]
+    Scheduler,
 }
diff --git a/crates/libcontainer/src/process/container_init_process.rs b/crates/libcontainer/src/process/container_init_process.rs
index f55c04ec8c..085ca229a4 100644
--- a/crates/libcontainer/src/process/container_init_process.rs
+++ b/crates/libcontainer/src/process/container_init_process.rs
@@ -7,13 +7,18 @@ use crate::{
     capabilities, hooks, namespaces::Namespaces, process::channel, rootfs::RootFS, tty,
     user_ns::UserNamespaceConfig, utils,
 };
+use nc;
 use nix::mount::MsFlags;
 use nix::sched::CloneFlags;
 use nix::sys::stat::Mode;
 use nix::unistd::setsid;
 use nix::unistd::{self, Gid, Uid};
-use oci_spec::runtime::{IOPriorityClass, LinuxIOPriority, LinuxNamespaceType, Spec, User};
+use oci_spec::runtime::{
+    IOPriorityClass, LinuxIOPriority, LinuxNamespaceType, LinuxSchedulerFlag, LinuxSchedulerPolicy,
+    Scheduler, Spec, User,
+};
 use std::collections::HashMap;
+use std::mem;
 use std::os::unix::io::AsRawFd;
 use std::{
     env, fs,
@@ -74,6 +79,8 @@ pub enum InitProcessError {
     WorkloadValidation(#[from] workload::ExecutorValidationError),
     #[error("invalid io priority class: {0}")]
     IoPriorityClass(String),
+    #[error("call exec sched_setattr error: {0}")]
+    SchedSetattr(String),
 }
 
 type Result<T> = std::result::Result<T, InitProcessError>;
@@ -288,6 +295,8 @@ pub fn container_init_process(
 
     set_io_priority(syscall.as_ref(), proc.io_priority())?;
 
+    setup_scheduler(proc.scheduler())?;
+
     // set up tty if specified
     if let Some(csocketfd) = args.console_socket {
         tty::setup_console(&csocketfd).map_err(|err| {
@@ -741,6 +750,73 @@ fn set_io_priority(syscall: &dyn Syscall, io_priority_op: &Option<LinuxIOPriority>) -> Result<()> {
     Ok(())
 }
 
+/// Applies the scheduler settings of the OCI process spec via `nc::sched_setattr`.
+///
+/// Returns `Ok(())` without issuing a syscall when `sc_op` is `None`. Otherwise policy, flags,
+/// nice, priority, runtime, deadline and period are passed to `nc::sched_setattr` with pid `0`;
+/// values missing from the spec default to `0`.
+///
+/// # Errors
+///
+/// Returns [`InitProcessError::SchedSetattr`] if the priority does not fit into a `u32` or if
+/// `nc::sched_setattr` reports an error.
+fn setup_scheduler(sc_op: &Option<Scheduler>) -> Result<()> {
+    if let Some(sc) = sc_op {
+        let policy: u32 = match sc.policy() {
+            LinuxSchedulerPolicy::SchedOther => 0,
+            LinuxSchedulerPolicy::SchedFifo => 1,
+            LinuxSchedulerPolicy::SchedRr => 2,
+            LinuxSchedulerPolicy::SchedBatch => 3,
+            LinuxSchedulerPolicy::SchedIso => 4,
+            LinuxSchedulerPolicy::SchedIdle => 5,
+            LinuxSchedulerPolicy::SchedDeadline => 6,
+        };
+        let mut flags_value: u64 = 0;
+        if let Some(flags) = sc.flags() {
+            for flag in flags {
+                flags_value |= match flag {
+                    LinuxSchedulerFlag::SchedResetOnFork => 0x01,
+                    LinuxSchedulerFlag::SchedFlagReclaim => 0x02,
+                    LinuxSchedulerFlag::SchedFlagDLOverrun => 0x04,
+                    LinuxSchedulerFlag::SchedFlagKeepPolicy => 0x08,
+                    LinuxSchedulerFlag::SchedFlagKeepParams => 0x10,
+                    LinuxSchedulerFlag::SchedFlagUtilClampMin => 0x20,
+                    LinuxSchedulerFlag::SchedFlagUtilClampMax => 0x40,
+                };
+            }
+        }
+        // `sched_priority` is unsigned: reject values that do not fit instead of letting an
+        // `as` cast silently wrap a negative priority into a huge one.
+        let priority = sc.priority().unwrap_or(0);
+        let sched_priority: u32 = priority.try_into().map_err(|_| {
+            tracing::error!(?priority, "scheduler priority must not be negative");
+            InitProcessError::SchedSetattr(format!("invalid scheduler priority: {}", priority))
+        })?;
+        let mut attr = nc::sched_attr_t {
+            size: mem::size_of::<nc::sched_attr_t>()
+                .try_into()
+                .expect("size of sched_attr_t fits in u32"),
+            sched_policy: policy,
+            sched_flags: flags_value,
+            sched_nice: sc.nice().unwrap_or(0),
+            sched_priority,
+            sched_runtime: sc.runtime().unwrap_or(0),
+            sched_deadline: sc.deadline().unwrap_or(0),
+            sched_period: sc.period().unwrap_or(0),
+            sched_util_min: 0,
+            sched_util_max: 0,
+        };
+        // SAFETY: `attr` is a fully initialised `sched_attr_t` that lives across the call and
+        // its `size` field is set to the size of that struct.
+        let result = unsafe { nc::sched_setattr(0, &mut attr, 0) };
+        result.map_err(|err| {
+            tracing::error!(?err, "error setting scheduler");
+            InitProcessError::SchedSetattr(err.to_string())
+        })?;
+    }
+    Ok(())
+}
+
 #[cfg(feature = "libseccomp")]
 fn sync_seccomp(
     fd: Option<i32>,
diff --git a/tests/integration_test/src/main.rs b/tests/integration_test/src/main.rs
index 351d090804..1dd67597d1 100644
--- a/tests/integration_test/src/main.rs
+++ b/tests/integration_test/src/main.rs
@@ -14,6 +14,7 @@ use crate::tests::seccomp::get_seccomp_test;
 use crate::tests::seccomp_notify::get_seccomp_notify_test;
 use crate::tests::sysctl::get_sysctl_test;
 use crate::tests::tlb::get_tlb_test;
+use crate::tests::process::get_scheduler_test;
 use crate::utils::support::{set_runtime_path, set_runtimetest_path};
 use anyhow::{Context, Result};
 use clap::Parser;
@@ -103,6 +104,7 @@ fn main() -> Result<()> {
     let mounts_recursive = get_mounts_recursive_test();
     let intel_rdt = get_intel_rdt_test();
     let sysctl = get_sysctl_test();
+    let scheduler = get_scheduler_test();
 
     tm.add_test_group(Box::new(cl));
     tm.add_test_group(Box::new(cc));
@@ -123,6 +125,7 @@ fn main() -> Result<()> {
     tm.add_test_group(Box::new(mounts_recursive));
     tm.add_test_group(Box::new(intel_rdt));
     tm.add_test_group(Box::new(sysctl));
+    tm.add_test_group(Box::new(scheduler));
 
     tm.add_cleanup(Box::new(cgroups::cleanup_v1));
     tm.add_cleanup(Box::new(cgroups::cleanup_v2));
diff --git a/tests/integration_test/src/tests/mod.rs b/tests/integration_test/src/tests/mod.rs
index c0d738e226..7beb5c0f1c 100644
--- a/tests/integration_test/src/tests/mod.rs
+++ b/tests/integration_test/src/tests/mod.rs
@@ -12,3 +12,4 @@ pub mod seccomp;
 pub mod seccomp_notify;
 pub mod sysctl;
 pub mod tlb;
+pub mod process;
diff --git a/tests/integration_test/src/tests/process/mod.rs b/tests/integration_test/src/tests/process/mod.rs
new file mode 100644
index 0000000000..f769759130
--- /dev/null
+++ b/tests/integration_test/src/tests/process/mod.rs
@@ -0,0 +1,3 @@
+mod scheduler;
+
+pub use scheduler::get_scheduler_test;
diff --git a/tests/integration_test/src/tests/process/scheduler.rs b/tests/integration_test/src/tests/process/scheduler.rs
new file mode 100644
index 0000000000..4b9cddb4d6
--- /dev/null
+++ b/tests/integration_test/src/tests/process/scheduler.rs
@@ -0,0 +1,53 @@
+use anyhow::{Context, Result};
+use oci_spec::runtime::{LinuxSchedulerPolicy, ProcessBuilder, SchedulerBuilder, Spec, SpecBuilder};
+use test_framework::{test_result, Test, TestGroup, TestResult};
+
+use crate::utils::test_inside_container;
+
+/// Builds a spec whose process runs `runtimetest hello_world` with scheduler `policy` and nice 0.
+fn create_spec(policy: LinuxSchedulerPolicy) -> Result<Spec> {
+    let sc = SchedulerBuilder::default()
+        .policy(policy)
+        .nice(0i32)
+        .build()
+        .context("failed to build scheduler")?;
+    SpecBuilder::default()
+        .process(
+            ProcessBuilder::default()
+                .args(
+                    ["runtimetest", "hello_world"]
+                        .iter()
+                        .map(|s| s.to_string())
+                        .collect::<Vec<String>>(),
+                )
+                .scheduler(sc)
+                .build()?,
+        )
+        .build()
+        .context("failed to create spec")
+}
+
+/// Runs a container configured with `SchedOther`, passing a no-op closure to the test helper.
+fn scheduler_policy_other_test() -> TestResult {
+    let spec = test_result!(create_spec(LinuxSchedulerPolicy::SchedOther));
+    test_inside_container(spec, &|_| Ok(()))
+}
+
+/// Runs a container configured with `SchedBatch`, passing a no-op closure to the test helper.
+fn scheduler_policy_batch_test() -> TestResult {
+    let spec = test_result!(create_spec(LinuxSchedulerPolicy::SchedBatch));
+    test_inside_container(spec, &|_| Ok(()))
+}
+
+/// Collects the scheduler policy tests into the `set_scheduler_policy` group.
+pub fn get_scheduler_test() -> TestGroup {
+    let mut scheduler_policy_group = TestGroup::new("set_scheduler_policy");
+    // Each test is named after the policy it actually configures.
+    let policy_other_test = Test::new("policy_other", Box::new(scheduler_policy_other_test));
+    let policy_batch_test = Test::new("policy_batch", Box::new(scheduler_policy_batch_test));
+
+    scheduler_policy_group.add(vec![Box::new(policy_other_test)]);
+    scheduler_policy_group.add(vec![Box::new(policy_batch_test)]);
+
+    scheduler_policy_group
+}