From f12082a9a69bfdd239aeed4ef7fc04b76cb9dcce Mon Sep 17 00:00:00 2001 From: Furisto <24721048+Furisto@users.noreply.github.com> Date: Sun, 21 Nov 2021 20:01:11 +0100 Subject: [PATCH 1/8] Apply resource restrictions in rootless mode --- crates/libcgroups/src/common.rs | 4 ++++ .../libcontainer/src/container/builder_impl.rs | 4 ++-- .../process/container_intermediate_process.rs | 17 ++++++++--------- crates/libcontainer/src/rootless.rs | 1 - 4 files changed, 14 insertions(+), 12 deletions(-) diff --git a/crates/libcgroups/src/common.rs b/crates/libcgroups/src/common.rs index 45f05e663..9d3e54872 100644 --- a/crates/libcgroups/src/common.rs +++ b/crates/libcgroups/src/common.rs @@ -176,6 +176,10 @@ pub fn create_cgroup_manager>( match cgroup_setup { CgroupSetup::Legacy | CgroupSetup::Hybrid => { + if systemd_cgroup { + bail!("resource control with systemd is not supported on cgroup v1"); + } + log::info!("cgroup manager V1 will be used"); Ok(Box::new(v1::manager::Manager::new(cgroup_path.into())?)) } diff --git a/crates/libcontainer/src/container/builder_impl.rs b/crates/libcontainer/src/container/builder_impl.rs index 633933585..47bdb312c 100644 --- a/crates/libcontainer/src/container/builder_impl.rs +++ b/crates/libcontainer/src/container/builder_impl.rs @@ -57,7 +57,7 @@ impl<'a> ContainerBuilderImpl<'a> { let cgroups_path = utils::get_cgroup_path(linux.cgroups_path(), &self.container_id); let cmanager = libcgroups::common::create_cgroup_manager( &cgroups_path, - self.use_systemd, + self.use_systemd || self.rootless.is_some(), &self.container_id, )?; let process = self.spec.process().as_ref().context("No process in spec")?; @@ -142,7 +142,7 @@ impl<'a> ContainerBuilderImpl<'a> { let cgroups_path = utils::get_cgroup_path(linux.cgroups_path(), &self.container_id); let cmanager = libcgroups::common::create_cgroup_manager( &cgroups_path, - self.use_systemd, + self.use_systemd || self.rootless.is_some(), &self.container_id, )?; diff --git a/crates/libcontainer/src/process/container_intermediate_process.rs b/crates/libcontainer/src/process/container_intermediate_process.rs index 547e7a9db..4bde50d3f 100644 --- a/crates/libcontainer/src/process/container_intermediate_process.rs +++ b/crates/libcontainer/src/process/container_intermediate_process.rs @@ -47,7 +47,7 @@ pub fn container_intermediate_process( // root in the user namespace likely is mapped to an non-priviliged user // on the parent user namespace. command.set_id(Uid::from_raw(0), Gid::from_raw(0)).context( - "Failed to configure uid and gid root in the beginning of a new user namespace", + "failed to configure uid and gid root in the beginning of a new user namespace", )?; } @@ -68,14 +68,13 @@ pub fn container_intermediate_process( // this needs to be done before we create the init process, so that the init // process will already be captured by the cgroup - if args.rootless.is_none() { - apply_cgroups( - args.cgroup_manager.as_ref(), - linux.resources().as_ref(), - args.init, - ) - .context("failed to apply cgroups")? - } + + apply_cgroups( + args.cgroup_manager.as_ref(), + linux.resources().as_ref(), + args.init, + ) + .context("failed to apply cgroups")?; // We have to record the pid of the child (container init process), since // the child will be inside the pid namespace. We can't rely on child_ready diff --git a/crates/libcontainer/src/rootless.rs b/crates/libcontainer/src/rootless.rs index 5ea61b0fd..edf711000 100644 --- a/crates/libcontainer/src/rootless.rs +++ b/crates/libcontainer/src/rootless.rs @@ -36,7 +36,6 @@ impl<'a> Rootless<'a> { if user_namespace.is_some() && user_namespace.unwrap().path().is_none() { log::debug!("rootless container should be created"); - log::warn!("resource constraints are unimplemented for rootless containers"); validate(spec).context("The spec failed to comply to rootless requirement")?; let mut rootless = Rootless::from(linux); From 190a0bad38876fe590b055c0a3f6b0060787e405 Mon Sep 17 00:00:00 2001 From: Furisto <24721048+Furisto@users.noreply.github.com> Date: Sun, 21 Nov 2021 20:50:30 +0100 Subject: [PATCH 2/8] Add session dbus connection --- crates/libcgroups/src/common.rs | 6 +++++- crates/libcgroups/src/systemd/dbus/client.rs | 9 ++++++++- crates/libcgroups/src/systemd/manager.rs | 7 +++++-- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/crates/libcgroups/src/common.rs b/crates/libcgroups/src/common.rs index 9d3e54872..475be0c23 100644 --- a/crates/libcgroups/src/common.rs +++ b/crates/libcgroups/src/common.rs @@ -188,11 +188,15 @@ pub fn create_cgroup_manager>( if !systemd::booted() { bail!("systemd cgroup flag passed, but systemd support for managing cgroups is not available"); } - log::info!("systemd cgroup manager will be used"); + + let use_system = nix::unistd::geteuid().is_root(); + + log::info!("systemd cgroup manager with system bus {} will be used", use_system); return Ok(Box::new(systemd::manager::Manager::new( DEFAULT_CGROUP_ROOT.into(), cgroup_path.into(), container_name.into(), + use_system )?)); } log::info!("cgroup manager V2 will be used"); diff --git a/crates/libcgroups/src/systemd/dbus/client.rs b/crates/libcgroups/src/systemd/dbus/client.rs index 6576ed0d2..26f8f3dfc 100644 --- a/crates/libcgroups/src/systemd/dbus/client.rs +++ b/crates/libcgroups/src/systemd/dbus/client.rs @@ -12,11 +12,18 @@ pub struct Client { } impl Client { - pub fn new() -> Result { + /// Uses the system bus to communicate with systemd + pub fn new_system() -> Result { let conn = Connection::new_system()?; Ok(Client { conn }) } + /// Uses the session bus to communicate with systemd + pub fn new_session() -> Result { + let conn = Connection::new_session()?; + Ok(Client { conn }) + } + fn create_proxy(&self) -> Proxy<&Connection> { self.conn.with_proxy( "org.freedesktop.systemd1", diff --git a/crates/libcgroups/src/systemd/manager.rs b/crates/libcgroups/src/systemd/manager.rs index dbe7d247c..295190659 100644 --- a/crates/libcgroups/src/systemd/manager.rs +++ b/crates/libcgroups/src/systemd/manager.rs @@ -61,7 +61,7 @@ impl Display for CgroupsPath { } impl Manager { - pub fn new(root_path: PathBuf, cgroups_path: PathBuf, container_name: String) -> Result { + pub fn new(root_path: PathBuf, cgroups_path: PathBuf, container_name: String, use_system: bool) -> Result { // TODO: create the systemd unit using a dbus client. let destructured_path = Self::destructure_cgroups_path(cgroups_path)?; let (cgroups_path, parent) = Self::construct_cgroups_path(&destructured_path)?; @@ -74,7 +74,10 @@ impl Manager { container_name, unit_name: Self::get_unit_name(&destructured_path), destructured_path, - client: Client::new().context("failed to create dbus client")?, + client: match use_system { + true => Client::new_system().context("failed to create system dbus client")?, + false => Client::new_session().context("failed to create session dbus client")?, + } }) } From c6b91abf355f8f08ebbbc3af518501f8137fde76 Mon Sep 17 00:00:00 2001 From: Furisto <24721048+Furisto@users.noreply.github.com> Date: Wed, 24 Nov 2021 00:08:02 +0100 Subject: [PATCH 3/8] Define & implement trait for systemd client --- crates/libcgroups/src/systemd/dbus/client.rs | 59 ++++++++++++++++++-- 1 file changed, 53 insertions(+), 6 deletions(-) diff --git a/crates/libcgroups/src/systemd/dbus/client.rs b/crates/libcgroups/src/systemd/dbus/client.rs index 26f8f3dfc..7d8aff2d9 100644 --- a/crates/libcgroups/src/systemd/dbus/client.rs +++ b/crates/libcgroups/src/systemd/dbus/client.rs @@ -3,25 +3,54 @@ use anyhow::{Context, Result}; use dbus::arg::{RefArg, Variant}; use dbus::blocking::{Connection, Proxy}; use std::collections::HashMap; +use std::path::PathBuf; use std::time::Duration; +pub trait SystemdClient { + fn is_system(&self) -> bool; + + fn start_transient_unit( + &self, + container_name: &str, + pid: u32, + parent: &str, + unit_name: &str, + ) -> Result<()>; + + fn stop_transient_unit(&self, unit_name: &str) -> Result<()>; + + fn set_unit_properties( + &self, + unit_name: &str, + properties: &HashMap<&str, Box>, + ) -> Result<()>; + + fn systemd_version(&self) -> Result; + + fn control_cgroup_root(&self) -> Result; +} + /// Client is a wrapper providing higher level API and abatraction around dbus. /// For more information see https://www.freedesktop.org/wiki/Software/systemd/dbus/ pub struct Client { conn: Connection, + system: bool, } impl Client { /// Uses the system bus to communicate with systemd pub fn new_system() -> Result { let conn = Connection::new_system()?; - Ok(Client { conn }) + Ok(Client { conn, system: true }) } /// Uses the session bus to communicate with systemd pub fn new_session() -> Result { let conn = Connection::new_session()?; - Ok(Client { conn }) + Ok(Client { + conn, + system: false, + }) } fn create_proxy(&self) -> Proxy<&Connection> { @@ -31,11 +60,17 @@ impl Client { Duration::from_millis(5000), ) } +} + +impl SystemdClient for Client { + fn is_system(&self) -> bool { + self.system + } /// start_transient_unit is a higher level API for starting a unit /// for a specific container under systemd. /// See https://www.freedesktop.org/wiki/Software/systemd/dbus for more details. - pub fn start_transient_unit( + fn start_transient_unit( &self, container_name: &str, pid: u32, @@ -77,6 +112,8 @@ impl Client { properties.push(("DefaultDependencies", Variant(Box::new(false)))); properties.push(("PIDs", Variant(Box::new(vec![pid])))); + log::debug!("START UNIT: {:?}", properties); + proxy .start_transient_unit(unit_name, "replace", properties, vec![]) .with_context(|| { @@ -88,7 +125,7 @@ impl Client { Ok(()) } - pub fn stop_transient_unit(&self, unit_name: &str) -> Result<()> { + fn stop_transient_unit(&self, unit_name: &str) -> Result<()> { let proxy = self.create_proxy(); proxy @@ -97,7 +134,7 @@ impl Client { Ok(()) } - pub fn set_unit_properties( + fn set_unit_properties( &self, unit_name: &str, properties: &HashMap<&str, Box>, @@ -115,7 +152,7 @@ impl Client { Ok(()) } - pub fn systemd_version(&self) -> Result { + fn systemd_version(&self) -> Result { let proxy = self.create_proxy(); let version = proxy @@ -130,4 +167,14 @@ impl Client { Ok(version) } + + fn control_cgroup_root(&self) -> Result { + let proxy = self.create_proxy(); + + let cgroup_root = proxy + .control_group() + .context("failed to get systemd control group")?; + PathBuf::try_from(cgroup_root) + .with_context(|| format!("parse systemd control cgroup into path")) + } } From 64fd60dda3370cb43ad8a33e8f2e2b349685e5bb Mon Sep 17 00:00:00 2001 From: Furisto <24721048+Furisto@users.noreply.github.com> Date: Wed, 24 Nov 2021 19:11:00 +0100 Subject: [PATCH 4/8] Systemd manager updates - Use systemd client to find systemd cgroup root - Add error context - Manager debug impl - Comments - Set default slice name for rootless and rootfull containers --- crates/libcgroups/src/common.rs | 9 +- crates/libcgroups/src/systemd/manager.rs | 163 +++++++++++++++++------ 2 files changed, 128 insertions(+), 44 deletions(-) diff --git a/crates/libcgroups/src/common.rs b/crates/libcgroups/src/common.rs index 475be0c23..c32bfa4c1 100644 --- a/crates/libcgroups/src/common.rs +++ b/crates/libcgroups/src/common.rs @@ -190,13 +190,16 @@ pub fn create_cgroup_manager>( } let use_system = nix::unistd::geteuid().is_root(); - - log::info!("systemd cgroup manager with system bus {} will be used", use_system); + + log::info!( + "systemd cgroup manager with system bus {} will be used", + use_system + ); return Ok(Box::new(systemd::manager::Manager::new( DEFAULT_CGROUP_ROOT.into(), cgroup_path.into(), container_name.into(), - use_system + use_system, )?)); } log::info!("cgroup manager V2 will be used"); diff --git a/crates/libcgroups/src/systemd/manager.rs b/crates/libcgroups/src/systemd/manager.rs index 295190659..39ea5ccee 100644 --- a/crates/libcgroups/src/systemd/manager.rs +++ b/crates/libcgroups/src/systemd/manager.rs @@ -2,7 +2,7 @@ #![allow(unused_variables)] use std::{ collections::HashMap, - fmt::Display, + fmt::{Debug, Display}, fs::{self}, os::unix::fs::PermissionsExt, path::Component::RootDir, @@ -18,7 +18,7 @@ use super::{ controller_type::{ControllerType, CONTROLLER_TYPES}, cpu::Cpu, cpuset::CpuSet, - dbus::client::Client, + dbus::client::{Client, SystemdClient}, memory::Memory, pids::Pids, }; @@ -32,14 +32,21 @@ const CGROUP_PROCS: &str = "cgroup.procs"; const CGROUP_CONTROLLERS: &str = "cgroup.controllers"; const CGROUP_SUBTREE_CONTROL: &str = "cgroup.subtree_control"; -/// SystemDCGroupManager is a driver for managing cgroups via systemd. pub struct Manager { + /// Root path of the cgroup hierarchy e.g. /sys/fs/cgroup root_path: PathBuf, + /// Path relative to the root path e.g. /system.slice/youki-569d5ce3afe1074769f67.scope for rootfull containers + /// and e.g. /user.slice/user-1000/user@1000.service/youki-569d5ce3afe1074769f67.scope for rootless containers cgroups_path: PathBuf, + /// Combination of root path and cgroups path full_path: PathBuf, + /// Destructured cgroups path as specified in the runtime spec e.g. system.slice:youki:569d5ce3afe1074769f67 destructured_path: CgroupsPath, + /// Name of the container e.g. 569d5ce3afe1074769f67 container_name: String, + /// Name of the systemd unit e.g. youki-569d5ce3afe1074769f67.scope unit_name: String, + /// Client for communicating with systemd client: Client, } @@ -60,11 +67,37 @@ impl Display for CgroupsPath { } } +// custom debug impl as Manager contains fields that do not implement Debug +// and therefore Debug cannot be derived +impl Debug for Manager { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Manager") + .field("root_path", &self.root_path) + .field("cgroups_path", &self.cgroups_path) + .field("full_path", &self.full_path) + .field("destructured_path", &self.destructured_path) + .field("container_name", &self.container_name) + .field("unit_name", &self.unit_name) + .finish() + } +} + impl Manager { - pub fn new(root_path: PathBuf, cgroups_path: PathBuf, container_name: String, use_system: bool) -> Result { - // TODO: create the systemd unit using a dbus client. - let destructured_path = Self::destructure_cgroups_path(cgroups_path)?; - let (cgroups_path, parent) = Self::construct_cgroups_path(&destructured_path)?; + pub fn new( + root_path: PathBuf, + cgroups_path: PathBuf, + container_name: String, + use_system: bool, + ) -> Result { + let destructured_path = Self::destructure_cgroups_path(&cgroups_path) + .with_context(|| format!("failed to destructure cgroups path {:?}", cgroups_path))?; + let client = match use_system { + true => Client::new_system().context("failed to create system dbus client")?, + false => Client::new_session().context("failed to create session dbus client")?, + }; + + let (cgroups_path, parent) = Self::construct_cgroups_path(&destructured_path, &client) + .context("failed to construct cgroups path")?; let full_path = root_path.join_safely(&cgroups_path)?; Ok(Manager { @@ -74,15 +107,11 @@ impl Manager { container_name, unit_name: Self::get_unit_name(&destructured_path), destructured_path, - client: match use_system { - true => Client::new_system().context("failed to create system dbus client")?, - false => Client::new_session().context("failed to create session dbus client")?, - } + client, }) } - fn destructure_cgroups_path(cgroups_path: PathBuf) -> Result { - log::debug!("CGROUPS PATH IS {:?}", cgroups_path); + fn destructure_cgroups_path(cgroups_path: &Path) -> Result { // cgroups path may never be empty as it is defaulted to `/youki` // see 'get_cgroup_path' under utils.rs. // if cgroups_path was provided it should be of the form [slice]:[prefix]:[name], @@ -95,11 +124,11 @@ impl Manager { name = cgroups_path .strip_prefix("/youki/")? .to_str() - .ok_or_else(|| anyhow!("failed to parse cgroupsPath field"))?; + .ok_or_else(|| anyhow!("failed to parse cgroups path"))?; } else { let parts = cgroups_path .to_str() - .ok_or_else(|| anyhow!("failed to parse cgroupsPath field"))? + .ok_or_else(|| anyhow!("failed to parse cgroups path"))? .split(':') .collect::>(); parent = parts[0]; @@ -124,6 +153,33 @@ impl Manager { cgroups_path.name.clone() } + // get_cgroups_path generates a cgroups path from the one provided by the user via cgroupsPath. + // an example of the final path: "/system.slice/docker-foo.scope" + fn construct_cgroups_path( + cgroups_path: &CgroupsPath, + client: &dyn SystemdClient, + ) -> Result<(PathBuf, PathBuf)> { + let mut parent = match client.is_system() { + true => PathBuf::from("/system.slice"), + false => PathBuf::from("/user.slice"), + }; + + // if the user provided a '.slice' (as in a branch of a tree) + // we need to convert it to a filesystem path. + if !cgroups_path.parent.is_empty() { + parent = Self::expand_slice(&cgroups_path.parent)?; + } + + let systemd_root = client.control_cgroup_root()?; + let unit_name = Self::get_unit_name(cgroups_path); + let cgroups_path = systemd_root + .join_safely(&parent) + .with_context(|| format!("failed to join {:?} with {:?}", systemd_root, parent))? + .join_safely(&unit_name) + .with_context(|| format!("failed to join {:?} with {:?}", parent, unit_name))?; + Ok((cgroups_path, parent)) + } + // systemd represents slice hierarchy using `-`, so we need to follow suit when // generating the path of slice. For example, 'test-a-b.slice' becomes // '/test.slice/test-a.slice/test-a-b.slice'. @@ -144,7 +200,7 @@ impl Manager { } for component in slice_name.split('-') { if component.is_empty() { - anyhow!("Invalid slice name: {}", slice); + anyhow!("invalid slice name: {}", slice); } // Append the component to the path and to the prefix. path = format!("{}/{}{}{}", path, prefix, component, suffix); @@ -153,23 +209,6 @@ impl Manager { Ok(Path::new(&path).to_path_buf()) } - // get_cgroups_path generates a cgroups path from the one provided by the user via cgroupsPath. - // an example of the final path: "/machine.slice/docker-foo.scope" - fn construct_cgroups_path(cgroups_path: &CgroupsPath) -> Result<(PathBuf, PathBuf)> { - // the root slice is under 'machine.slice'. - let mut parent = PathBuf::from("/system.slice"); - // if the user provided a '.slice' (as in a branch of a tree) - // we need to convert it to a filesystem path. - if !cgroups_path.parent.is_empty() { - parent = Self::expand_slice(&cgroups_path.parent)?; - } - let unit_name = Self::get_unit_name(cgroups_path); - let cgroups_path = parent - .join_safely(&unit_name) - .with_context(|| format!("failed to join {:?} with {:?}", parent, unit_name))?; - Ok((cgroups_path, parent)) - } - /// create_unified_cgroup verifies sure that *each level* in the downward path from the root cgroup /// down to the cgroup_path provided by the user is a valid cgroup hierarchy, /// containing the attached controllers and that it contains the container pid. @@ -266,6 +305,10 @@ impl CgroupManager for Manager { ) })?; + let cg = self.client.control_cgroup_root().context("cgroup root")?; + log::debug!("CONTROL GROUP ROOT: {:?}", cg); + log::debug!("MANAGER {:?}", self); + Ok(()) } @@ -332,8 +375,48 @@ impl CgroupManager for Manager { #[cfg(test)] mod tests { + use crate::systemd::dbus::client::SystemdClient; + use super::*; + struct TestSystemdClient {} + + impl SystemdClient for TestSystemdClient { + fn is_system(&self) -> bool { + true + } + + fn start_transient_unit( + &self, + container_name: &str, + pid: u32, + parent: &str, + unit_name: &str, + ) -> Result<()> { + Ok(()) + } + + fn stop_transient_unit(&self, unit_name: &str) -> Result<()> { + Ok(()) + } + + fn set_unit_properties( + &self, + unit_name: &str, + properties: &HashMap<&str, Box>, + ) -> Result<()> { + Ok(()) + } + + fn systemd_version(&self) -> Result { + Ok(245) + } + + fn control_cgroup_root(&self) -> Result { + Ok(PathBuf::from("/")) + } + } + #[test] fn expand_slice_works() -> Result<()> { assert_eq!( @@ -347,11 +430,10 @@ mod tests { #[test] fn get_cgroups_path_works_with_a_complex_slice() -> Result<()> { let cgroups_path = - Manager::destructure_cgroups_path(PathBuf::from("test-a-b.slice:docker:foo")) - .expect(""); + Manager::destructure_cgroups_path(Path::new("test-a-b.slice:docker:foo")).expect(""); assert_eq!( - Manager::construct_cgroups_path(&cgroups_path)?.0, + Manager::construct_cgroups_path(&cgroups_path, &TestSystemdClient {})?.0, PathBuf::from("/test.slice/test-a.slice/test-a-b.slice/docker-foo.scope"), ); @@ -361,10 +443,10 @@ mod tests { #[test] fn get_cgroups_path_works_with_a_simple_slice() -> Result<()> { let cgroups_path = - Manager::destructure_cgroups_path(PathBuf::from("machine.slice:libpod:foo")).expect(""); + Manager::destructure_cgroups_path(Path::new("machine.slice:libpod:foo")).expect(""); assert_eq!( - Manager::construct_cgroups_path(&cgroups_path)?.0, + Manager::construct_cgroups_path(&cgroups_path, &TestSystemdClient {})?.0, PathBuf::from("/machine.slice/libpod-foo.scope"), ); @@ -373,11 +455,10 @@ mod tests { #[test] fn get_cgroups_path_works_with_scope() -> Result<()> { - let cgroups_path = - Manager::destructure_cgroups_path(PathBuf::from(":docker:foo")).expect(""); + let cgroups_path = Manager::destructure_cgroups_path(Path::new(":docker:foo")).expect(""); assert_eq!( - Manager::construct_cgroups_path(&cgroups_path)?.0, + Manager::construct_cgroups_path(&cgroups_path, &TestSystemdClient {})?.0, PathBuf::from("/system.slice/docker-foo.scope"), ); From 419284137ec62327d4cd787161439bd135302fac Mon Sep 17 00:00:00 2001 From: Furisto <24721048+Furisto@users.noreply.github.com> Date: Wed, 24 Nov 2021 19:28:03 +0100 Subject: [PATCH 5/8] Check if unprivileged user namespaces are enabled --- crates/libcontainer/src/rootless.rs | 17 +++++++++++++++++ crates/youki/src/commands/info.rs | 8 +++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/crates/libcontainer/src/rootless.rs b/crates/libcontainer/src/rootless.rs index edf711000..88240ee9a 100644 --- a/crates/libcontainer/src/rootless.rs +++ b/crates/libcontainer/src/rootless.rs @@ -2,6 +2,7 @@ use crate::{namespaces::Namespaces, utils}; use anyhow::{bail, Context, Result}; use nix::unistd::Pid; use oci_spec::runtime::{Linux, LinuxIdMapping, LinuxNamespace, LinuxNamespaceType, Mount, Spec}; +use std::fs; use std::path::Path; use std::process::Command; use std::{env, path::PathBuf}; @@ -104,6 +105,22 @@ pub fn rootless_required() -> bool { matches!(std::env::var("YOUKI_USE_ROOTLESS").as_deref(), Ok("true")) } +pub fn unprivileged_user_ns_enabled() -> Result { + let user_ns_sysctl = Path::new("/proc/sys/kernel/unprivileged_userns_clone"); + if !user_ns_sysctl.exists() { + return Ok(true); + } + + let content = + fs::read_to_string(user_ns_sysctl).context("failed to read unprivileged userns clone")?; + + match content.trim().parse::()? { + 0 => Ok(false), + 1 => Ok(true), + v => bail!("failed to parse unprivileged userns value: {}", v), + } +} + /// Validates that the spec contains the required information for /// running in rootless mode fn validate(spec: &Spec) -> Result<()> { diff --git a/crates/youki/src/commands/info.rs b/crates/youki/src/commands/info.rs index 2f9bca37f..146159853 100644 --- a/crates/youki/src/commands/info.rs +++ b/crates/youki/src/commands/info.rs @@ -3,6 +3,7 @@ use std::{collections::HashSet, fs, path::Path}; use anyhow::Result; use clap::Parser; +use libcontainer::rootless; use procfs::{CpuInfo, Meminfo}; use libcgroups::{common::CgroupSetup, v2::controller_type::ControllerType}; @@ -176,7 +177,12 @@ pub fn print_namespaces() { println!(" {:<16}enabled", "mount"); print_feature_status(&content, "CONFIG_UTS_NS", FeatureDisplay::new("uts")); print_feature_status(&content, "CONFIG_IPC_NS", FeatureDisplay::new("ipc")); - print_feature_status(&content, "CONFIG_USER_NS", FeatureDisplay::new("user")); + + let user_display = match rootless::unprivileged_user_ns_enabled() { + Ok(false) => FeatureDisplay::with_status("user", "enabled (root only)", "disabled"), + _ => FeatureDisplay::new("user"), + }; + print_feature_status(&content, "CONFIG_USER_NS", user_display); print_feature_status(&content, "CONFIG_PID_NS", FeatureDisplay::new("pid")); print_feature_status(&content, "CONFIG_NET_NS", FeatureDisplay::new("network")); // While the CONFIG_CGROUP_NS kernel feature exists, it is obsolete and should not be used. CGroup namespaces From 9cff02435bab5e61c66e56b9787a22e376babbe0 Mon Sep 17 00:00:00 2001 From: Furisto <24721048+Furisto@users.noreply.github.com> Date: Wed, 24 Nov 2021 19:42:21 +0100 Subject: [PATCH 6/8] Cleanup --- crates/libcgroups/src/common.rs | 4 ---- crates/libcgroups/src/systemd/dbus/client.rs | 4 ++-- crates/libcgroups/src/systemd/manager.rs | 4 ---- 3 files changed, 2 insertions(+), 10 deletions(-) diff --git a/crates/libcgroups/src/common.rs b/crates/libcgroups/src/common.rs index c32bfa4c1..81d1b054f 100644 --- a/crates/libcgroups/src/common.rs +++ b/crates/libcgroups/src/common.rs @@ -176,10 +176,6 @@ pub fn create_cgroup_manager>( match cgroup_setup { CgroupSetup::Legacy | CgroupSetup::Hybrid => { - if systemd_cgroup { - bail!("resource control with systemd is not supported on cgroup v1"); - } - log::info!("cgroup manager V1 will be used"); Ok(Box::new(v1::manager::Manager::new(cgroup_path.into())?)) } diff --git a/crates/libcgroups/src/systemd/dbus/client.rs b/crates/libcgroups/src/systemd/dbus/client.rs index 7d8aff2d9..d2ab1e13a 100644 --- a/crates/libcgroups/src/systemd/dbus/client.rs +++ b/crates/libcgroups/src/systemd/dbus/client.rs @@ -174,7 +174,7 @@ impl SystemdClient for Client { let cgroup_root = proxy .control_group() .context("failed to get systemd control group")?; - PathBuf::try_from(cgroup_root) - .with_context(|| format!("parse systemd control cgroup into path")) + PathBuf::try_from(&cgroup_root) + .with_context(|| format!("parse systemd control cgroup {} into path", cgroup_root)) } } diff --git a/crates/libcgroups/src/systemd/manager.rs b/crates/libcgroups/src/systemd/manager.rs index 39ea5ccee..4f66c1519 100644 --- a/crates/libcgroups/src/systemd/manager.rs +++ b/crates/libcgroups/src/systemd/manager.rs @@ -305,10 +305,6 @@ impl CgroupManager for Manager { ) })?; - let cg = self.client.control_cgroup_root().context("cgroup root")?; - log::debug!("CONTROL GROUP ROOT: {:?}", cg); - log::debug!("MANAGER {:?}", self); - Ok(()) } From f92b265b80a14987af8766e9074d6d6b303d10e0 Mon Sep 17 00:00:00 2001 From: Furisto <24721048+Furisto@users.noreply.github.com> Date: Thu, 25 Nov 2021 20:25:22 +0100 Subject: [PATCH 7/8] Ensure rootless containers work on v1 --- .../process/container_intermediate_process.rs | 27 ++++++++++++------- crates/youki/src/main.rs | 5 ++++ 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/crates/libcontainer/src/process/container_intermediate_process.rs b/crates/libcontainer/src/process/container_intermediate_process.rs index 4bde50d3f..66ce9c5da 100644 --- a/crates/libcontainer/src/process/container_intermediate_process.rs +++ b/crates/libcontainer/src/process/container_intermediate_process.rs @@ -22,6 +22,23 @@ pub fn container_intermediate_process( let linux = spec.linux().as_ref().context("no linux in spec")?; let namespaces = Namespaces::from(linux.namespaces().as_ref()); + // this needs to be done before we create the init process, so that the init + // process will already be captured by the cgroup. It also needs to be done + // before we enter the user namespace because if a privileged user starts a + // rootless container on a cgroup v1 system we can still fullfill resource + // restrictions through the cgroup fs support (delegation through systemd is + // not supported for v1 by us). This only works if the user has not yet been + // mapped to an unprivileged user by the user namespace however. + // In addition this needs to be done before we enter the cgroup namespace as + // the cgroup of the process will form the root of the cgroup hierarchy in + // the cgroup namespace. + apply_cgroups( + args.cgroup_manager.as_ref(), + linux.resources().as_ref(), + args.init, + ) + .context("failed to apply cgroups")?; + // if new user is specified in specification, this will be true and new // namespace will be created, check // https://man7.org/linux/man-pages/man7/user_namespaces.7.html for more @@ -66,16 +83,6 @@ pub fn container_intermediate_process( .with_context(|| format!("Failed to enter pid namespace: {:?}", pid_namespace))?; } - // this needs to be done before we create the init process, so that the init - // process will already be captured by the cgroup - - apply_cgroups( - args.cgroup_manager.as_ref(), - linux.resources().as_ref(), - args.init, - ) - .context("failed to apply cgroups")?; - // We have to record the pid of the child (container init process), since // the child will be inside the pid namespace. We can't rely on child_ready // to send us the correct pid. diff --git a/crates/youki/src/main.rs b/crates/youki/src/main.rs index 67657a56d..f1ea9fd67 100644 --- a/crates/youki/src/main.rs +++ b/crates/youki/src/main.rs @@ -112,6 +112,11 @@ fn main() -> Result<()> { eprintln!("log init failed: {:?}", e); } + log::debug!( + "started by user {} with {:?}", + nix::unistd::geteuid(), + std::env::args_os() + ); let root_path = determine_root_path(opts.root)?; let systemd_cgroup = opts.systemd_cgroup; From 1a14c43c5bec422208c889fc25d3276fc0258cf8 Mon Sep 17 00:00:00 2001 From: Furisto <24721048+Furisto@users.noreply.github.com> Date: Sun, 28 Nov 2021 20:29:18 +0100 Subject: [PATCH 8/8] Review feedback - Add cgroups path to error context - Correct spelling mistake - Update sequence diagram - Implement TryFrom for CgroupsPath --- crates/libcgroups/src/systemd/manager.rs | 86 +++++++++++-------- .../process/container_intermediate_process.rs | 2 +- docs/.drawio.svg | 74 ++++++++-------- 3 files changed, 86 insertions(+), 76 deletions(-) diff --git a/crates/libcgroups/src/systemd/manager.rs b/crates/libcgroups/src/systemd/manager.rs index 4f66c1519..4b0d9c93f 100644 --- a/crates/libcgroups/src/systemd/manager.rs +++ b/crates/libcgroups/src/systemd/manager.rs @@ -61,6 +61,42 @@ struct CgroupsPath { name: String, } +impl TryFrom<&Path> for CgroupsPath { + type Error = anyhow::Error; + + fn try_from(cgroups_path: &Path) -> Result { + // cgroups path may never be empty as it is defaulted to `/youki` + // see 'get_cgroup_path' under utils.rs. + // if cgroups_path was provided it should be of the form [slice]:[prefix]:[name], + // for example: "system.slice:docker:1234". + let mut parent = ""; + let prefix; + let name; + if cgroups_path.starts_with("/youki") { + prefix = "youki"; + name = cgroups_path + .strip_prefix("/youki/")? + .to_str() + .ok_or_else(|| anyhow!("failed to parse cgroups path {:?}", cgroups_path))?; + } else { + let parts = cgroups_path + .to_str() + .ok_or_else(|| anyhow!("failed to parse cgroups path {:?}", cgroups_path))? + .split(':') + .collect::>(); + parent = parts[0]; + prefix = parts[1]; + name = parts[2]; + } + + Ok(CgroupsPath { + parent: parent.to_owned(), + prefix: prefix.to_owned(), + name: name.to_owned(), + }) + } +} + impl Display for CgroupsPath { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}:{}:{}", self.parent, self.prefix, self.name) @@ -89,7 +125,9 @@ impl Manager { container_name: String, use_system: bool, ) -> Result { - let destructured_path = Self::destructure_cgroups_path(&cgroups_path) + let destructured_path = cgroups_path + .as_path() + .try_into() .with_context(|| format!("failed to destructure cgroups path {:?}", cgroups_path))?; let client = match use_system { true => Client::new_system().context("failed to create system dbus client")?, @@ -111,38 +149,6 @@ impl Manager { }) } - fn destructure_cgroups_path(cgroups_path: &Path) -> Result { - // cgroups path may never be empty as it is defaulted to `/youki` - // see 'get_cgroup_path' under utils.rs. - // if cgroups_path was provided it should be of the form [slice]:[prefix]:[name], - // for example: "system.slice:docker:1234". - let mut parent = ""; - let prefix; - let name; - if cgroups_path.starts_with("/youki") { - prefix = "youki"; - name = cgroups_path - .strip_prefix("/youki/")? - .to_str() - .ok_or_else(|| anyhow!("failed to parse cgroups path"))?; - } else { - let parts = cgroups_path - .to_str() - .ok_or_else(|| anyhow!("failed to parse cgroups path"))? - .split(':') - .collect::>(); - parent = parts[0]; - prefix = parts[1]; - name = parts[2]; - } - - Ok(CgroupsPath { - parent: parent.to_owned(), - prefix: prefix.to_owned(), - name: name.to_owned(), - }) - } - /// get_unit_name returns the unit (scope) name from the path provided by the user /// for example: foo:docker:bar returns in '/docker-bar.scope' fn get_unit_name(cgroups_path: &CgroupsPath) -> String { @@ -425,8 +431,9 @@ mod tests { #[test] fn get_cgroups_path_works_with_a_complex_slice() -> Result<()> { - let cgroups_path = - Manager::destructure_cgroups_path(Path::new("test-a-b.slice:docker:foo")).expect(""); + let cgroups_path = Path::new("test-a-b.slice:docker:foo") + .try_into() + .context("construct path")?; assert_eq!( Manager::construct_cgroups_path(&cgroups_path, &TestSystemdClient {})?.0, @@ -438,8 +445,9 @@ mod tests { #[test] fn get_cgroups_path_works_with_a_simple_slice() -> Result<()> { - let cgroups_path = - Manager::destructure_cgroups_path(Path::new("machine.slice:libpod:foo")).expect(""); + let cgroups_path = Path::new("machine.slice:libpod:foo") + .try_into() + .context("construct path")?; assert_eq!( Manager::construct_cgroups_path(&cgroups_path, &TestSystemdClient {})?.0, @@ -451,7 +459,9 @@ mod tests { #[test] fn get_cgroups_path_works_with_scope() -> Result<()> { - let cgroups_path = Manager::destructure_cgroups_path(Path::new(":docker:foo")).expect(""); + let cgroups_path = Path::new(":docker:foo") + .try_into() + .context("construct path")?; assert_eq!( Manager::construct_cgroups_path(&cgroups_path, &TestSystemdClient {})?.0, diff --git a/crates/libcontainer/src/process/container_intermediate_process.rs b/crates/libcontainer/src/process/container_intermediate_process.rs index 66ce9c5da..1f265ab07 100644 --- a/crates/libcontainer/src/process/container_intermediate_process.rs +++ b/crates/libcontainer/src/process/container_intermediate_process.rs @@ -25,7 +25,7 @@ pub fn container_intermediate_process( // this needs to be done before we create the init process, so that the init // process will already be captured by the cgroup. It also needs to be done // before we enter the user namespace because if a privileged user starts a - // rootless container on a cgroup v1 system we can still fullfill resource + // rootless container on a cgroup v1 system we can still fulfill resource // restrictions through the cgroup fs support (delegation through systemd is // not supported for v1 by us). This only works if the user has not yet been // mapped to an unprivileged user by the user namespace however. diff --git a/docs/.drawio.svg b/docs/.drawio.svg index c2fcf5cc9..505da318f 100644 --- a/docs/.drawio.svg +++ b/docs/.drawio.svg @@ -1,4 +1,4 @@ - + @@ -115,13 +115,13 @@ - - - + + + -
+
@@ -133,18 +133,18 @@
- + fork(2) - - - + + + -
+
@@ -154,16 +154,16 @@
- + send identifier mapping request - + -
+
unshare(CLONE_NEWUSER) @@ -171,18 +171,18 @@
- + unshare(CLONE_NEWUSER) - + - + -
+
write uid mapping @@ -190,16 +190,16 @@
- + write uid mapping - + -
+
write gid mapping @@ -207,18 +207,18 @@
- + write gid mapping - - - + + + -
+
@@ -228,7 +228,7 @@
- + send mapping written @@ -410,13 +410,13 @@ - + - + -
+
setup cgroup @@ -424,17 +424,17 @@
- + setup cgroup - - + + -
+
unshare(CLONE_NEWPID) @@ -442,16 +442,16 @@
- + unshare(CLONE_NEWPID) - + -
+
set uid and gid @@ -459,7 +459,7 @@
- + set uid and gid