From 9895a6df3b3a4cc1501467c1711e21aa401f8cc4 Mon Sep 17 00:00:00 2001 From: Fraser Tweedale Date: Wed, 16 Jun 2021 15:30:42 +1000 Subject: [PATCH] chown cgroup to process uid in container namespace Delegating cgroups to the container enables more complex workloads, including systemd-based workloads. The OCI runtime-spec was recently updated to explicitly admit such delegation, through specification of cgroup ownership semantics: https://github.com/opencontainers/runtime-spec/pull/1123 Pursuant to the updated OCI runtime-spec, change the ownership of the container's cgroup directory and particular files therein, when using cgroups v2 and when the cgroupfs is to be mounted read/write. As a result of this change, systemd workloads can run in isolated user namespaces on OpenShift when the sandbox's cgroupfs is mounted read/write. It might be possible to implement this feature in other cgroup managers, but that work is deferred. Signed-off-by: Fraser Tweedale --- libcontainer/cgroups/systemd/v2.go | 39 +++++++++++++++ libcontainer/configs/cgroup_linux.go | 6 +++ libcontainer/specconv/spec_linux.go | 43 +++++++++++++++++ tests/integration/cgroup_delegation.bats | 61 ++++++++++++++++++++++++ 4 files changed, 149 insertions(+) create mode 100644 tests/integration/cgroup_delegation.bats diff --git a/libcontainer/cgroups/systemd/v2.go b/libcontainer/cgroups/systemd/v2.go index 55273b722c1..5acda9a95cc 100644 --- a/libcontainer/cgroups/systemd/v2.go +++ b/libcontainer/cgroups/systemd/v2.go @@ -3,8 +3,10 @@ package systemd import ( + "bufio" "fmt" "math" + "os" "path/filepath" "strconv" "strings" @@ -291,9 +293,46 @@ func (m *unifiedManager) Apply(pid int) error { if err := fs2.CreateCgroupPath(m.path, m.cgroups); err != nil { return err } + + if c.OwnerUID != nil { + filesToChown, err := cgroupFilesToChown() + if err != nil { + return err + } + + for _, v := range filesToChown { + err := os.Chown(m.path+"/"+v, *c.OwnerUID, -1) + if err != nil { + return err + } + } + } + return nil } +// The kernel exposes a list of files that should be chowned to the delegate +// uid in /sys/kernel/cgroup/delegate. If the file is not present +// (Linux < 4.15), use the initial values mentioned in cgroups(7). +func cgroupFilesToChown() ([]string, error) { + filesToChown := []string{"."} // the directory itself must be chowned + const cgroupDelegateFile = "/sys/kernel/cgroup/delegate" + f, err := os.Open(cgroupDelegateFile) + if err == nil { + defer f.Close() + scanner := bufio.NewScanner(f) + for scanner.Scan() { + filesToChown = append(filesToChown, scanner.Text()) + } + if err := scanner.Err(); err != nil { + return nil, fmt.Errorf("error reading %s: %w", cgroupDelegateFile, err) + } + } else { + filesToChown = append(filesToChown, "cgroup.procs", "cgroup.subtree_control", "cgroup.threads") + } + return filesToChown, nil +} + func (m *unifiedManager) Destroy() error { if m.cgroups.Paths != nil { return nil diff --git a/libcontainer/configs/cgroup_linux.go b/libcontainer/configs/cgroup_linux.go index 5ea9d940cef..0f49da95ca2 100644 --- a/libcontainer/configs/cgroup_linux.go +++ b/libcontainer/configs/cgroup_linux.go @@ -39,6 +39,12 @@ type Cgroup struct { // derived from org.systemd.property.xxx annotations. // Ignored unless systemd is used for managing cgroups. SystemdProps []systemdDbus.Property `json:"-"` + // The host UID that should own the cgroup, or nil to accept + // the default ownership. This should only be set when the + // cgroupfs is to be mounted read/write. + // Not all cgroup manager implementations support changing + // the ownership. + OwnerUID *int `json:"owner_uid,omitempty"` } type Resources struct { diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go index 8474769c940..f243fa63cac 100644 --- a/libcontainer/specconv/spec_linux.go +++ b/libcontainer/specconv/spec_linux.go @@ -309,6 +309,49 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { } } } + + // Set the host UID that should own the container's cgroup. + // This must be performed after setupUserNamespace, so that + // config.HostRootUID() returns the correct result. + // + // Only set it if the container will have its own cgroup + // namespace and the cgroupfs will be mounted read/write. + // + hasCgroupNS := config.Namespaces.Contains(configs.NEWCGROUP) && config.Namespaces.PathOf(configs.NEWCGROUP) == "" + hasRwCgroupfs := false + if hasCgroupNS { + for _, m := range config.Mounts { + if m.Source == "cgroup" && filepath.Clean(m.Destination) == "/sys/fs/cgroup" && (m.Flags&unix.MS_RDONLY) == 0 { + hasRwCgroupfs = true + break + } + } + } + processUid := 0 + if spec.Process != nil { + // Chown the cgroup to the UID running the process, + // which is not necessarily UID 0 in the container + // namespace (e.g., an unprivileged UID in the host + // user namespace). + processUid = int(spec.Process.User.UID) + } + if hasCgroupNS && hasRwCgroupfs { + ownerUid, err := config.HostUID(processUid) + // There are two error cases; we can ignore both. + // + // 1. uidMappings is unset. Either there is no user + // namespace (fine), or it is an error (which is + // checked elsewhere). + // + // 2. The user is unmapped in the user namespace. This is an + // unusual configuration and might be an error. But it too + // will be checked elsewhere, so we can ignore it here. + // + if err == nil { + config.Cgroups.OwnerUID = &ownerUid + } + } + if spec.Process != nil { config.OomScoreAdj = spec.Process.OOMScoreAdj config.NoNewPrivileges = spec.Process.NoNewPrivileges diff --git a/tests/integration/cgroup_delegation.bats b/tests/integration/cgroup_delegation.bats new file mode 100644 index 00000000000..db1407ce3b1 --- /dev/null +++ b/tests/integration/cgroup_delegation.bats @@ -0,0 +1,61 @@ +#!/usr/bin/env bats + +load helpers + +function teardown() { + teardown_bundle +} + +function setup() { + requires root cgroups_v2 systemd + + setup_busybox + + # chown test temp dir to allow host user to read it + chown 100000 "$ROOT" + + # chown rootfs to allow host user to mkdir mount points + chown 100000 "$ROOT"/bundle/rootfs + + set_cgroups_path + + # configure a user namespace + update_config ' .linux.namespaces += [{"type": "user"}] + | .linux.uidMappings += [{"hostID": 100000, "containerID": 0, "size": 65536}] + | .linux.gidMappings += [{"hostID": 100000, "containerID": 0, "size": 65536}] + ' +} + +@test "runc exec (cgroup v2, ro cgroupfs, new cgroupns) does not chown cgroup" { + runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroup_chown + [ "$status" -eq 0 ] + + runc exec test_cgroup_chown sh -c "stat -c %U /sys/fs/cgroup" + [ "$status" -eq 0 ] + [ "$output" = "nobody" ] # /sys/fs/cgroup owned by unmapped user +} + +@test "runc exec (cgroup v2, rw cgroupfs, inh cgroupns) does not chown cgroup" { + set_cgroup_mount_writable + + # inherit cgroup namespace (remove cgroup from namespaces list) + update_config '.linux.namespaces |= map(select(.type != "cgroup"))' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroup_chown + [ "$status" -eq 0 ] + + runc exec test_cgroup_chown sh -c "stat -c %U /sys/fs/cgroup" + [ "$status" -eq 0 ] + [ "$output" = "nobody" ] # /sys/fs/cgroup owned by unmapped user +} + +@test "runc exec (cgroup v2, rw cgroupfs, new cgroupns) does chown cgroup" { + set_cgroup_mount_writable + + runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroup_chown + [ "$status" -eq 0 ] + + runc exec test_cgroup_chown sh -c "stat -c %U /sys/fs/cgroup" + [ "$status" -eq 0 ] + [ "$output" = "root" ] # /sys/fs/cgroup owned by root (of user namespace) +}