From 6162618e7a30e4f6ad594ff110c9097cbfe54d82 Mon Sep 17 00:00:00 2001 From: Eric Fang Date: Wed, 19 Jul 2023 06:09:43 -0700 Subject: [PATCH] Refactor the libcgroups interface (#2168) * fix notify_listener - fix the name to notify listener - fix the structure to be clone-able Signed-off-by: yihuaf * changed the libcgroup creation interface Changed the libcgroup creation interface to use config struct rather than variables. The creation will also own/consume the config struct. In this way, we don't have to create the cgroup manager upfront. Instead, we can delay the creation of cgroup manager in the process when it is needed. Signed-off-by: yihuaf * Add a notify listener test Signed-off-by: yihuaf * fix clippy Signed-off-by: yihuaf * fix spellcheck Signed-off-by: yihuaf --------- Signed-off-by: yihuaf --- crates/libcgroups/src/common.rs | 35 +++++++----- crates/libcgroups/src/systemd/manager.rs | 2 +- crates/libcgroups/src/v1/manager.rs | 4 +- .../src/container/builder_impl.rs | 31 ++++++----- .../src/container/container_delete.rs | 9 +-- .../src/container/container_events.rs | 9 +-- .../src/container/container_kill.rs | 29 ++++++---- .../src/container/container_pause.rs | 8 ++- .../src/container/container_resume.rs | 8 ++- crates/libcontainer/src/notify_socket.rs | 55 ++++++++++++++++++- crates/libcontainer/src/process/args.rs | 11 ++-- .../src/process/container_init_process.rs | 13 ++--- .../process/container_intermediate_process.rs | 4 +- crates/youki/src/commands/mod.rs | 11 ++-- 14 files changed, 152 insertions(+), 77 deletions(-) diff --git a/crates/libcgroups/src/common.rs b/crates/libcgroups/src/common.rs index aba574f58..a0d20424d 100644 --- a/crates/libcgroups/src/common.rs +++ b/crates/libcgroups/src/common.rs @@ -316,17 +316,22 @@ pub enum CreateCgroupSetupError { Systemd(#[from] systemd::manager::SystemdManagerError), } -pub fn create_cgroup_manager>( - cgroup_path: P, - systemd_cgroup: bool, - container_name: &str, +#[derive(Clone)] +pub struct CgroupConfig { + pub cgroup_path: PathBuf, + pub systemd_cgroup: bool, + pub container_name: String, +} + +pub fn create_cgroup_manager( + config: CgroupConfig, ) -> Result { let cgroup_setup = get_cgroup_setup().map_err(|err| match err { GetCgroupSetupError::WrappedIo(err) => CreateCgroupSetupError::WrappedIo(err), GetCgroupSetupError::NonDefault => CreateCgroupSetupError::NonDefault, GetCgroupSetupError::FailedToDetect => CreateCgroupSetupError::FailedToDetect, })?; - let cgroup_path = cgroup_path.into(); + let cgroup_path = config.cgroup_path.as_path(); match cgroup_setup { CgroupSetup::Legacy | CgroupSetup::Hybrid => { @@ -334,17 +339,17 @@ pub fn create_cgroup_manager>( } CgroupSetup::Unified => { // ref https://github.com/opencontainers/runtime-spec/blob/main/config-linux.md#cgroups-path - if cgroup_path.is_absolute() || !systemd_cgroup { + if cgroup_path.is_absolute() || !config.systemd_cgroup { return Ok(create_v2_cgroup_manager(cgroup_path)?.any()); } - Ok(create_systemd_cgroup_manager(cgroup_path, container_name)?.any()) + Ok(create_systemd_cgroup_manager(cgroup_path, config.container_name.as_str())?.any()) } } } #[cfg(feature = "v1")] fn create_v1_cgroup_manager( - cgroup_path: PathBuf, + cgroup_path: &Path, ) -> Result { tracing::info!("cgroup manager V1 will be used"); v1::manager::Manager::new(cgroup_path) @@ -352,29 +357,29 @@ fn create_v1_cgroup_manager( #[cfg(not(feature = "v1"))] fn create_v1_cgroup_manager( - _cgroup_path: PathBuf, + _cgroup_path: &Path, ) -> Result { Err(v1::manager::V1ManagerError::NotEnabled) } #[cfg(feature = "v2")] fn create_v2_cgroup_manager( - cgroup_path: PathBuf, + cgroup_path: &Path, ) -> Result { tracing::info!("cgroup manager V2 will be used"); - v2::manager::Manager::new(DEFAULT_CGROUP_ROOT.into(), cgroup_path) + v2::manager::Manager::new(DEFAULT_CGROUP_ROOT.into(), cgroup_path.to_owned()) } #[cfg(not(feature = "v2"))] fn create_v2_cgroup_manager( - _cgroup_path: PathBuf, + _cgroup_path: &Path, ) -> Result { Err(v2::manager::V2ManagerError::NotEnabled) } #[cfg(feature = "systemd")] fn create_systemd_cgroup_manager( - cgroup_path: PathBuf, + cgroup_path: &Path, container_name: &str, ) -> Result { if !systemd::booted() { @@ -391,7 +396,7 @@ fn create_systemd_cgroup_manager( ); systemd::manager::Manager::new( DEFAULT_CGROUP_ROOT.into(), - cgroup_path, + cgroup_path.to_owned(), container_name.into(), use_system, ) @@ -399,7 +404,7 @@ fn create_systemd_cgroup_manager( #[cfg(not(feature = "systemd"))] fn create_systemd_cgroup_manager( - _cgroup_path: PathBuf, + _cgroup_path: &Path, _container_name: &str, ) -> Result { Err(systemd::manager::SystemdManagerError::NotEnabled) diff --git a/crates/libcgroups/src/systemd/manager.rs b/crates/libcgroups/src/systemd/manager.rs index 21a2e60d8..286b2fe34 100644 --- a/crates/libcgroups/src/systemd/manager.rs +++ b/crates/libcgroups/src/systemd/manager.rs @@ -180,7 +180,7 @@ impl Manager { container_name: String, use_system: bool, ) -> Result { - let mut destructured_path = cgroups_path.as_path().try_into()?; + let mut destructured_path: CgroupsPath = cgroups_path.as_path().try_into()?; ensure_parent_unit(&mut destructured_path, use_system); let client = match use_system { diff --git a/crates/libcgroups/src/v1/manager.rs b/crates/libcgroups/src/v1/manager.rs index e1376a883..022ffbbcb 100644 --- a/crates/libcgroups/src/v1/manager.rs +++ b/crates/libcgroups/src/v1/manager.rs @@ -82,10 +82,10 @@ pub enum V1ManagerError { impl Manager { /// Constructs a new cgroup manager with cgroups_path being relative to the root of the subsystem - pub fn new(cgroup_path: PathBuf) -> Result { + pub fn new(cgroup_path: &Path) -> Result { let mut subsystems = HashMap::::new(); for subsystem in CONTROLLERS { - if let Ok(subsystem_path) = Self::get_subsystem_path(&cgroup_path, subsystem) { + if let Ok(subsystem_path) = Self::get_subsystem_path(cgroup_path, subsystem) { subsystems.insert(*subsystem, subsystem_path); } else { tracing::warn!("cgroup {} not supported on this system", subsystem); diff --git a/crates/libcontainer/src/container/builder_impl.rs b/crates/libcontainer/src/container/builder_impl.rs index 96d90266b..9122fc9f3 100644 --- a/crates/libcontainer/src/container/builder_impl.rs +++ b/crates/libcontainer/src/container/builder_impl.rs @@ -73,11 +73,11 @@ impl<'a> ContainerBuilderImpl<'a> { &self.container_id, self.rootless.is_some(), ); - let cmanager = libcgroups::common::create_cgroup_manager( - cgroups_path, - self.use_systemd || self.rootless.is_some(), - &self.container_id, - )?; + let cgroup_config = libcgroups::common::CgroupConfig { + cgroup_path: cgroups_path, + systemd_cgroup: self.use_systemd || self.rootless.is_some(), + container_name: self.container_id.to_owned(), + }; let process = self .spec .process() @@ -93,8 +93,10 @@ impl<'a> ContainerBuilderImpl<'a> { // Need to create the notify socket before we pivot root, since the unix // domain socket used here is outside of the rootfs of container. During // exec, need to create the socket before we enter into existing mount - // namespace. - let notify_socket: NotifyListener = NotifyListener::new(&self.notify_path)?; + // namespace. We also need to create to socket before entering into the + // user namespace in the case that the path is located in paths only + // root can access. + let notify_listener = NotifyListener::new(&self.notify_path)?; // If Out-of-memory score adjustment is set in specification. set the score // value for the current process check @@ -139,11 +141,11 @@ impl<'a> ContainerBuilderImpl<'a> { spec: self.spec, rootfs: &self.rootfs, console_socket: self.console_socket, - notify_socket, + notify_listener, preserve_fds: self.preserve_fds, container: &self.container, rootless: &self.rootless, - cgroup_manager: cmanager, + cgroup_config, detached: self.detached, executor_manager: &self.executor_manager, }; @@ -184,11 +186,12 @@ impl<'a> ContainerBuilderImpl<'a> { &self.container_id, self.rootless.is_some(), ); - let cmanager = libcgroups::common::create_cgroup_manager( - cgroups_path, - self.use_systemd || self.rootless.is_some(), - &self.container_id, - )?; + let cmanager = + libcgroups::common::create_cgroup_manager(libcgroups::common::CgroupConfig { + cgroup_path: cgroups_path, + systemd_cgroup: self.use_systemd || self.rootless.is_some(), + container_name: self.container_id.to_string(), + })?; let mut errors = Vec::new(); diff --git a/crates/libcontainer/src/container/container_delete.rs b/crates/libcontainer/src/container/container_delete.rs index 0e25a5456..0322fd392 100644 --- a/crates/libcontainer/src/container/container_delete.rs +++ b/crates/libcontainer/src/container/container_delete.rs @@ -81,11 +81,12 @@ impl Container { // remove the cgroup created for the container // check https://man7.org/linux/man-pages/man7/cgroups.7.html // creating and removing cgroups section for more information on cgroups - let use_systemd = self.systemd(); let cmanager = libcgroups::common::create_cgroup_manager( - &config.cgroup_path, - use_systemd, - self.id(), + libcgroups::common::CgroupConfig { + cgroup_path: config.cgroup_path.to_owned(), + systemd_cgroup: self.systemd(), + container_name: self.id().to_string(), + }, )?; cmanager.remove().map_err(|err| { tracing::error!(cgroup_path = ?config.cgroup_path, "failed to remove cgroup due to: {err:?}"); diff --git a/crates/libcontainer/src/container/container_events.rs b/crates/libcontainer/src/container/container_events.rs index ca4124bc0..ed253be3a 100644 --- a/crates/libcontainer/src/container/container_events.rs +++ b/crates/libcontainer/src/container/container_events.rs @@ -33,11 +33,12 @@ impl Container { return Err(LibcontainerError::IncorrectStatus); } - let cgroups_path = self.spec()?.cgroup_path; - let use_systemd = self.systemd(); - let cgroup_manager = - libcgroups::common::create_cgroup_manager(cgroups_path, use_systemd, self.id())?; + libcgroups::common::create_cgroup_manager(libcgroups::common::CgroupConfig { + cgroup_path: self.spec()?.cgroup_path, + systemd_cgroup: self.systemd(), + container_name: self.id().to_string(), + })?; match stats { true => { let stats = cgroup_manager.stats()?; diff --git a/crates/libcontainer/src/container/container_kill.rs b/crates/libcontainer/src/container/container_kill.rs index faf7b89fe..6b5d08540 100644 --- a/crates/libcontainer/src/container/container_kill.rs +++ b/crates/libcontainer/src/container/container_kill.rs @@ -1,6 +1,6 @@ use super::{Container, ContainerStatus}; use crate::{error::LibcontainerError, signal::Signal}; -use libcgroups::common::{create_cgroup_manager, get_cgroup_setup, CgroupManager}; +use libcgroups::common::{get_cgroup_setup, CgroupManager}; use nix::sys::signal::{self}; impl Container { @@ -78,10 +78,14 @@ impl Container { match get_cgroup_setup()? { libcgroups::common::CgroupSetup::Legacy | libcgroups::common::CgroupSetup::Hybrid => { - let cgroups_path = self.spec()?.cgroup_path; - let use_systemd = self.systemd(); - let cmanger = create_cgroup_manager(cgroups_path, use_systemd, self.id())?; - cmanger.freeze(libcgroups::common::FreezerState::Thawed)?; + let cmanager = libcgroups::common::create_cgroup_manager( + libcgroups::common::CgroupConfig { + cgroup_path: self.spec()?.cgroup_path, + systemd_cgroup: self.systemd(), + container_name: self.id().to_string(), + }, + )?; + cmanager.freeze(libcgroups::common::FreezerState::Thawed)?; } libcgroups::common::CgroupSetup::Unified => {} } @@ -91,11 +95,14 @@ impl Container { fn kill_all_processes>(&self, signal: S) -> Result<(), LibcontainerError> { let signal = signal.into().into_raw(); - let cgroups_path = self.spec()?.cgroup_path; - let use_systemd = self.systemd(); - let cmanger = create_cgroup_manager(cgroups_path, use_systemd, self.id())?; + let cmanager = + libcgroups::common::create_cgroup_manager(libcgroups::common::CgroupConfig { + cgroup_path: self.spec()?.cgroup_path, + systemd_cgroup: self.systemd(), + container_name: self.id().to_string(), + })?; - if let Err(e) = cmanger.freeze(libcgroups::common::FreezerState::Frozen) { + if let Err(e) = cmanager.freeze(libcgroups::common::FreezerState::Frozen) { tracing::warn!( err = ?e, id = ?self.id(), @@ -103,7 +110,7 @@ impl Container { ); } - let pids = cmanger.get_all_pids()?; + let pids = cmanager.get_all_pids()?; pids.iter() .try_for_each(|&pid| { tracing::debug!("kill signal {} to {}", signal, pid); @@ -117,7 +124,7 @@ impl Container { } }) .map_err(LibcontainerError::OtherSyscall)?; - if let Err(err) = cmanger.freeze(libcgroups::common::FreezerState::Thawed) { + if let Err(err) = cmanager.freeze(libcgroups::common::FreezerState::Thawed) { tracing::warn!( err = ?err, id = ?self.id(), diff --git a/crates/libcontainer/src/container/container_pause.rs b/crates/libcontainer/src/container/container_pause.rs index 462f3d475..8e7248e46 100644 --- a/crates/libcontainer/src/container/container_pause.rs +++ b/crates/libcontainer/src/container/container_pause.rs @@ -32,10 +32,12 @@ impl Container { return Err(LibcontainerError::IncorrectStatus); } - let cgroups_path = self.spec()?.cgroup_path; - let use_systemd = self.systemd(); let cmanager = - libcgroups::common::create_cgroup_manager(cgroups_path, use_systemd, self.id())?; + libcgroups::common::create_cgroup_manager(libcgroups::common::CgroupConfig { + cgroup_path: self.spec()?.cgroup_path, + systemd_cgroup: self.systemd(), + container_name: self.id().to_string(), + })?; cmanager.freeze(FreezerState::Frozen)?; tracing::debug!("saving paused status"); diff --git a/crates/libcontainer/src/container/container_resume.rs b/crates/libcontainer/src/container/container_resume.rs index 9e3dae5d8..544b8e8e5 100644 --- a/crates/libcontainer/src/container/container_resume.rs +++ b/crates/libcontainer/src/container/container_resume.rs @@ -34,10 +34,12 @@ impl Container { return Err(LibcontainerError::IncorrectStatus); } - let cgroups_path = self.spec()?.cgroup_path; - let use_systemd = self.systemd(); let cmanager = - libcgroups::common::create_cgroup_manager(cgroups_path, use_systemd, self.id())?; + libcgroups::common::create_cgroup_manager(libcgroups::common::CgroupConfig { + cgroup_path: self.spec()?.cgroup_path, + systemd_cgroup: self.systemd(), + container_name: self.id().to_string(), + })?; // resume the frozen container cmanager.freeze(FreezerState::Thawed)?; diff --git a/crates/libcontainer/src/notify_socket.rs b/crates/libcontainer/src/notify_socket.rs index 9daf16b89..64837ca87 100644 --- a/crates/libcontainer/src/notify_socket.rs +++ b/crates/libcontainer/src/notify_socket.rs @@ -1,6 +1,7 @@ use nix::unistd::{self, close}; use std::env; use std::io::prelude::*; +use std::os::fd::FromRawFd; use std::os::unix::io::AsRawFd; use std::os::unix::net::{UnixListener, UnixStream}; use std::path::{Path, PathBuf}; @@ -9,7 +10,7 @@ pub const NOTIFY_FILE: &str = "notify.sock"; #[derive(Debug, thiserror::Error)] pub enum NotifyListenerError { - #[error("failed to chdir while creating notify socket")] + #[error("failed to chdir {path} while creating notify socket: {source}")] Chdir { source: nix::Error, path: PathBuf }, #[error("invalid path: {0}")] InvalidPath(PathBuf), @@ -43,6 +44,7 @@ pub struct NotifyListener { impl NotifyListener { pub fn new(socket_path: &Path) -> Result { + tracing::debug!(?socket_path, "create notify listener"); // Unix domain socket has a maximum length of 108, different from // normal path length of 255. Due to how docker create the path name // to the container working directory, there is a high chance that @@ -56,6 +58,7 @@ impl NotifyListener { .file_name() .ok_or_else(|| NotifyListenerError::InvalidPath(socket_path.to_owned()))?; let cwd = env::current_dir().map_err(NotifyListenerError::GetCwd)?; + tracing::debug!(?cwd, "the cwd to create the notify socket"); unistd::chdir(workdir).map_err(|e| NotifyListenerError::Chdir { source: e, path: workdir.to_owned(), @@ -93,6 +96,23 @@ impl NotifyListener { } } +impl Clone for NotifyListener { + fn clone(&self) -> Self { + let fd = self.socket.as_raw_fd(); + // This is safe because we just duplicate a valid fd. Theoretically, to + // truly clone a unix listener, we have to use dup(2) to duplicate the + // fd, and then use from_raw_fd to create a new UnixListener. However, + // for our purposes, fd is just an integer to pass around for the same + // socket. Our main usage is to pass the notify_listener across process + // boundary. Since fd tables are cloned during clone/fork calls, this + // should be safe to use, as long as we be careful with not closing the + // same fd in different places. If we observe an issue, we will switch + // to `dup`. + let socket = unsafe { UnixListener::from_raw_fd(fd) }; + Self { socket } + } +} + pub struct NotifySocket { path: PathBuf, } @@ -135,3 +155,36 @@ impl NotifySocket { Ok(()) } } + +#[cfg(test)] +mod test { + use tempfile::tempdir; + + use super::*; + + #[test] + /// Test that the listener can be cloned and function correctly. This test + /// also serves as a test for the normal case. + fn test_notify_listener_clone() { + let tempdir = tempdir().unwrap(); + let socket_path = tempdir.path().join("notify.sock"); + // listener needs to be created first because it will create the socket. + let listener = NotifyListener::new(&socket_path).unwrap(); + let mut socket = NotifySocket::new(socket_path.clone()); + // This is safe without race because the unix domain socket is already + // created. It is OK for the socket to send the start notification + // before the listener wait is called. + let thread_handle = std::thread::spawn({ + move || { + // We clone the listener and listen on the cloned listener to + // make sure the cloned fd functions correctly. + let cloned_listener = listener.clone(); + cloned_listener.wait_for_container_start().unwrap(); + cloned_listener.close().unwrap(); + } + }); + + socket.notify_container_start().unwrap(); + thread_handle.join().unwrap(); + } +} diff --git a/crates/libcontainer/src/process/args.rs b/crates/libcontainer/src/process/args.rs index e59d75f46..f616c7c59 100644 --- a/crates/libcontainer/src/process/args.rs +++ b/crates/libcontainer/src/process/args.rs @@ -1,12 +1,13 @@ -use libcgroups::common::AnyCgroupManager; +use libcgroups::common::CgroupConfig; use oci_spec::runtime::Spec; use std::os::unix::prelude::RawFd; use std::path::PathBuf; +use crate::container::Container; +use crate::notify_socket::NotifyListener; use crate::rootless::Rootless; use crate::syscall::syscall::SyscallType; use crate::workload::ExecutorManager; -use crate::{container::Container, notify_socket::NotifyListener}; #[derive(Debug, Copy, Clone)] pub enum ContainerType { @@ -26,15 +27,15 @@ pub struct ContainerArgs<'a> { /// Socket to communicate the file descriptor of the ptty pub console_socket: Option, /// The Unix Domain Socket to communicate container start - pub notify_socket: NotifyListener, + pub notify_listener: NotifyListener, /// File descriptors preserved/passed to the container init process. pub preserve_fds: i32, /// Container state pub container: &'a Option, /// Options for rootless containers pub rootless: &'a Option>, - /// Cgroup Manager - pub cgroup_manager: AnyCgroupManager, + /// Cgroup Manager Config + pub cgroup_config: CgroupConfig, /// If the container is to be run in detached mode pub detached: bool, /// Manage the functions that actually run on the container diff --git a/crates/libcontainer/src/process/container_init_process.rs b/crates/libcontainer/src/process/container_init_process.rs index 6e38521a0..62acdc596 100644 --- a/crates/libcontainer/src/process/container_init_process.rs +++ b/crates/libcontainer/src/process/container_init_process.rs @@ -343,6 +343,7 @@ pub fn container_init_process( let hooks = spec.hooks().as_ref(); let container = args.container.as_ref(); let namespaces = Namespaces::try_from(linux.namespaces().as_ref())?; + let notify_listener = &args.notify_listener; setsid().map_err(|err| { tracing::error!(?err, "failed to setsid to create a session"); @@ -657,13 +658,11 @@ pub fn container_init_process( })?; // listing on the notify socket for container start command - args.notify_socket - .wait_for_container_start() - .map_err(|err| { - tracing::error!(?err, "failed to wait for container start"); - err - })?; - args.notify_socket.close().map_err(|err| { + notify_listener.wait_for_container_start().map_err(|err| { + tracing::error!(?err, "failed to wait for container start"); + err + })?; + notify_listener.close().map_err(|err| { tracing::error!(?err, "failed to close notify socket"); err })?; diff --git a/crates/libcontainer/src/process/container_intermediate_process.rs b/crates/libcontainer/src/process/container_intermediate_process.rs index b8ab33d4c..1d8b8918d 100644 --- a/crates/libcontainer/src/process/container_intermediate_process.rs +++ b/crates/libcontainer/src/process/container_intermediate_process.rs @@ -43,6 +43,8 @@ pub fn container_intermediate_process( let spec = &args.spec; let linux = spec.linux().as_ref().ok_or(MissingSpecError::Linux)?; let namespaces = Namespaces::try_from(linux.namespaces().as_ref())?; + let cgroup_manager = + libcgroups::common::create_cgroup_manager(args.cgroup_config.to_owned()).unwrap(); // this needs to be done before we create the init process, so that the init // process will already be captured by the cgroup. It also needs to be done @@ -55,7 +57,7 @@ pub fn container_intermediate_process( // the cgroup of the process will form the root of the cgroup hierarchy in // the cgroup namespace. apply_cgroups( - &args.cgroup_manager, + &cgroup_manager, linux.resources().as_ref(), matches!(args.container_type, ContainerType::InitContainer), )?; diff --git a/crates/youki/src/commands/mod.rs b/crates/youki/src/commands/mod.rs index 0c763c529..d99a0ec57 100644 --- a/crates/youki/src/commands/mod.rs +++ b/crates/youki/src/commands/mod.rs @@ -59,12 +59,11 @@ fn create_cgroup_manager>( container_id: &str, ) -> Result { let container = load_container(root_path, container_id)?; - let cgroups_path = container.spec()?.cgroup_path; - let systemd_cgroup = container.systemd(); - Ok(libcgroups::common::create_cgroup_manager( - cgroups_path, - systemd_cgroup, - container.id(), + libcgroups::common::CgroupConfig { + cgroup_path: container.spec()?.cgroup_path, + systemd_cgroup: container.systemd(), + container_name: container.id().to_string(), + }, )?) }