From 5503bc5d18bd13f8fe7705ce493a0a557a88090f Mon Sep 17 00:00:00 2001 From: David Trudgian Date: Fri, 27 Aug 2021 12:48:14 -0500 Subject: [PATCH] cgroups v2 support (apply from v1/OCI spec) Support execution on a cgroups v2 unified hierarchy system, by using the `containerd/cgroups/v2` translation of the `opencontainers/runtime-spec` `LinuxResources` structure (which we use for cgroups config toml files) into v2 Resources. Device limits are implemented with the eBPF translation. This will mean that existing v1 / OCI style cgroups configuration files can be used on a system configured for the unified hierarchy without an ugly failure. Also fixes failures with the `oci` commands on these systems. Note that we don't yet support the Unified attribute from LinuxResources, which is a free-form map[string]string for expressing cgroups v2 configuration natively. Fixes #60 --- CHANGELOG.md | 7 + go.sum | 2 + internal/app/singularity/oci_update_linux.go | 7 +- internal/pkg/cgroups/cgroups_linux.go | 154 ------------- internal/pkg/cgroups/cgroups_linux_test.go | 198 ---------------- internal/pkg/cgroups/config_linux.go | 11 +- .../cgroups/example/cgroups-no-hugetlb.toml | 1 + internal/pkg/cgroups/example/cgroups.toml | 2 +- internal/pkg/cgroups/manager_linux.go | 144 ++++++++++++ internal/pkg/cgroups/manager_linux_test.go | 67 ++++++ internal/pkg/cgroups/managerv1_linux.go | 165 +++++++++++++ internal/pkg/cgroups/managerv1_linux_test.go | 112 +++++++++ internal/pkg/cgroups/managerv2_linux.go | 217 ++++++++++++++++++ internal/pkg/cgroups/managerv2_linux_test.go | 122 ++++++++++ internal/pkg/instance/instance_linux.go | 3 +- .../pkg/runtime/engine/config/oci/config.go | 43 +++- .../pkg/runtime/engine/oci/cleanup_linux.go | 6 +- .../pkg/runtime/engine/oci/config_linux.go | 30 +-- .../pkg/runtime/engine/oci/create_linux.go | 97 ++++---- .../pkg/runtime/engine/oci/prepare_linux.go | 17 +- .../engine/singularity/cleanup_linux.go | 4 +- .../engine/singularity/container_linux.go | 
17 +- .../engine/singularity/prepare_linux.go | 19 +- .../engine/singularity/process_linux.go | 6 + internal/pkg/test/tool/require/require.go | 30 ++- internal/pkg/util/fs/mount/mount_linux.go | 3 +- 26 files changed, 1026 insertions(+), 458 deletions(-) delete mode 100644 internal/pkg/cgroups/cgroups_linux.go delete mode 100644 internal/pkg/cgroups/cgroups_linux_test.go create mode 100644 internal/pkg/cgroups/manager_linux.go create mode 100644 internal/pkg/cgroups/manager_linux_test.go create mode 100644 internal/pkg/cgroups/managerv1_linux.go create mode 100644 internal/pkg/cgroups/managerv1_linux_test.go create mode 100644 internal/pkg/cgroups/managerv2_linux.go create mode 100644 internal/pkg/cgroups/managerv2_linux_test.go diff --git a/CHANGELOG.md b/CHANGELOG.md index b45488b4f7..a38f71350b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -61,11 +61,18 @@ capabilities. `--nvccli` is not currently supported in the hybrid fakeroot (setuid install + `--fakeroot`) workflow. Please see documentation for more details. +- The `--apply-cgroups` flag can be used to apply cgroups resource and device + restrictions on a system using the v2 unified cgroups hierarchy. The resource + restrictions must still be specified in the v1 / OCI format, which will be + translated into v2 cgroups resource restrictions, and eBPF device + restrictions. ### Bug fixes - Fix regression when files `source`d from `%environment` contain `\` escaped shell builtins (fixes issue with `source` of conda profile.d script). +- The `oci` commands will operate on systems that use the v2 unified cgroups + hierarchy. 
## v3.8.2 \[2021-08-19\] diff --git a/go.sum b/go.sum index ab0a3c4d36..8e9cdb8352 100644 --- a/go.sum +++ b/go.sum @@ -169,6 +169,7 @@ github.com/cilium/ebpf v0.0.0-20200110133405-4032b1d8aae3/go.mod h1:MA5e5Lr8slmE github.com/cilium/ebpf v0.0.0-20200702112145-1c8d4c9ef775/go.mod h1:7cR51M8ViRLIdUjrmSXlK9pkrsDlLHbO8jiB8X8JnOc= github.com/cilium/ebpf v0.2.0/go.mod h1:To2CFviqOWL/M0gIMsvSMlqe7em/l1ALkX1PyjrX2Qs= github.com/cilium/ebpf v0.4.0/go.mod h1:4tRaxcgiL706VnOzHOdBlY8IEAIdxINsQBcU4xJJXRs= +github.com/cilium/ebpf v0.6.2 h1:iHsfF/t4aW4heW2YKfeHrVPGdtYTL4C4KocpM8KTSnI= github.com/cilium/ebpf v0.6.2/go.mod h1:4tRaxcgiL706VnOzHOdBlY8IEAIdxINsQBcU4xJJXRs= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= @@ -362,6 +363,7 @@ github.com/fatih/color v1.12.0 h1:mRhaKNwANqRgUBGKmnI5ZxEk7QXmjQeCcuYFMX2bfcc= github.com/fatih/color v1.12.0/go.mod h1:ELkj/draVOlAH/xkhN6mQ50Qd0MPOk5AAr3maGEBuJM= github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568/go.mod h1:xEzjJPgXI435gkrCt3MPfRiAkVrwSbHsst4LCFVfpJc= github.com/form3tech-oss/jwt-go v3.2.2+incompatible/go.mod h1:pbq4aXjuKjdthFRnoDwaVPLA+WlJuPGy+QneDUgJi2k= +github.com/frankban/quicktest v1.11.3 h1:8sXhOn0uLys67V8EsXLc6eszDs8VXWxL3iRvebPhedY= github.com/frankban/quicktest v1.11.3/go.mod h1:wRf/ReqHper53s+kmmSZizM8NamnL3IM0I9ntUbOk+k= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/fsnotify/fsnotify v1.4.9 h1:hsms1Qyu0jgnwNXIxa+/V/PDsU6CfLf6CNO8H7IWoS4= diff --git a/internal/app/singularity/oci_update_linux.go b/internal/app/singularity/oci_update_linux.go index 7b978345fd..6912146e1a 100644 --- a/internal/app/singularity/oci_update_linux.go +++ b/internal/app/singularity/oci_update_linux.go @@ -1,4 +1,4 @@ -// Copyright (c) 2018-2019, Sylabs Inc. All rights reserved. +// Copyright (c) 2018-2021, Sylabs Inc. 
All rights reserved. // This software is licensed under a 3-clause BSD license. Please consult the // LICENSE.md file distributed with the sources of this project regarding your // rights to use or distribute this software. @@ -35,7 +35,10 @@ func OciUpdate(containerID string, args *OciArgs) error { } resources := &specs.LinuxResources{} - manager := &cgroups.Manager{Pid: state.State.Pid} + manager, err := cgroups.GetManagerFromPid(state.State.Pid) + if err != nil { + return fmt.Errorf("failed to get cgroups manager: %v", err) + } if args.FromFile == "-" { reader = os.Stdin diff --git a/internal/pkg/cgroups/cgroups_linux.go b/internal/pkg/cgroups/cgroups_linux.go deleted file mode 100644 index 1eab857909..0000000000 --- a/internal/pkg/cgroups/cgroups_linux.go +++ /dev/null @@ -1,154 +0,0 @@ -// Copyright (c) 2018, Sylabs Inc. All rights reserved. -// This software is licensed under a 3-clause BSD license. Please consult the -// LICENSE.md file distributed with the sources of this project regarding your -// rights to use or distribute this software. 
- -package cgroups - -import ( - "encoding/json" - "fmt" - "path/filepath" - "strings" - - "github.com/containerd/cgroups" - specs "github.com/opencontainers/runtime-spec/specs-go" -) - -// Manager manage container cgroup resources restriction -type Manager struct { - Path string - Pid int - cgroup cgroups.Cgroup -} - -func readSpecFromFile(path string) (spec specs.LinuxResources, err error) { - conf, err := LoadConfig(path) - if err != nil { - return - } - - // convert TOML structures to OCI JSON structures - data, err := json.Marshal(conf) - if err != nil { - return - } - - if err = json.Unmarshal(data, &spec); err != nil { - return - } - - return -} - -// GetCgroupRootPath returns cgroup root path -func (m *Manager) GetCgroupRootPath() string { - if m.cgroup == nil { - return "" - } - - for _, sub := range m.cgroup.Subsystems() { - processes, err := m.cgroup.Processes(sub.Name(), false) - if len(processes) == 0 || err != nil { - continue - } - process := processes[0] - cgroupPath := strings.Split(process.Path, string(sub.Name()))[0] - return filepath.Clean(cgroupPath) - } - - return "" -} - -// ApplyFromSpec applies cgroups resources restriction from OCI specification -func (m *Manager) ApplyFromSpec(spec *specs.LinuxResources) (err error) { - var path cgroups.Path - - if !filepath.IsAbs(m.Path) { - return fmt.Errorf("cgroup path must be an absolute path") - } - - path = cgroups.StaticPath(m.Path) - - s := spec - if s == nil { - s = &specs.LinuxResources{} - } - - // creates cgroup - m.cgroup, err = cgroups.New(cgroups.V1, path, s) - if err != nil { - return err - } - - if err := m.cgroup.Add(cgroups.Process{Pid: m.Pid}); err != nil { - return err - } - - return -} - -// ApplyFromFile applies cgroups resources restriction from TOML configuration -// file -func (m *Manager) ApplyFromFile(path string) error { - spec, err := readSpecFromFile(path) - if err != nil { - return err - } - return m.ApplyFromSpec(&spec) -} - -func (m *Manager) loadFromPid() (err error) { 
- if m.Pid == 0 { - return fmt.Errorf("no process ID specified") - } - path := cgroups.PidPath(m.Pid) - m.cgroup, err = cgroups.Load(cgroups.V1, path) - return -} - -// UpdateFromSpec updates cgroups resources restriction from OCI specification -func (m *Manager) UpdateFromSpec(spec *specs.LinuxResources) (err error) { - if m.cgroup == nil { - if err = m.loadFromPid(); err != nil { - return - } - } - err = m.cgroup.Update(spec) - return -} - -// UpdateFromFile updates cgroups resources restriction from TOML configuration -func (m *Manager) UpdateFromFile(path string) error { - spec, err := readSpecFromFile(path) - if err != nil { - return err - } - return m.UpdateFromSpec(&spec) -} - -// Remove removes resources restriction for current managed process -func (m *Manager) Remove() error { - // deletes subgroup - return m.cgroup.Delete() -} - -// Pause suspends all processes inside the container -func (m *Manager) Pause() error { - if m.cgroup == nil { - if err := m.loadFromPid(); err != nil { - return err - } - } - return m.cgroup.Freeze() -} - -// Resume resumes all processes that have been previously paused -func (m *Manager) Resume() error { - if m.cgroup == nil { - if err := m.loadFromPid(); err != nil { - return err - } - } - return m.cgroup.Thaw() -} diff --git a/internal/pkg/cgroups/cgroups_linux_test.go b/internal/pkg/cgroups/cgroups_linux_test.go deleted file mode 100644 index 5e055a877f..0000000000 --- a/internal/pkg/cgroups/cgroups_linux_test.go +++ /dev/null @@ -1,198 +0,0 @@ -// Copyright (c) 2018, Sylabs Inc. All rights reserved. -// This software is licensed under a 3-clause BSD license. Please consult the -// LICENSE.md file distributed with the sources of this project regarding your -// rights to use or distribute this software. 
- -package cgroups - -import ( - "bufio" - "fmt" - "io/ioutil" - "os" - "os/exec" - "path/filepath" - "strconv" - "strings" - "testing" - - "github.com/sylabs/singularity/internal/pkg/test" -) - -func readIntFromFile(path string) (int64, error) { - file, err := os.Open(path) - if err != nil { - return 0, err - } - defer file.Close() - - scanner := bufio.NewScanner(file) - for scanner.Scan() { - return strconv.ParseInt(scanner.Text(), 10, 64) - } - - return 0, fmt.Errorf("no data found") -} - -func TestCgroups(t *testing.T) { - test.EnsurePrivilege(t) - - cmd := exec.Command("/bin/cat") - pipe, err := cmd.StdinPipe() - if err != nil { - t.Fatal(err) - } - - if err := cmd.Start(); err != nil { - t.Fatal(err) - } - - pid := cmd.Process.Pid - strPid := strconv.Itoa(pid) - path := filepath.Join("/singularity", strPid) - - manager := &Manager{Pid: pid, Path: path} - - cgroupsToml := "example/cgroups.toml" - // Some systems, e.g. ppc64le may not have a 2MB page size, so don't - // apply a 2MB hugetlb limit if that's the case. 
- _, err = os.Stat("/sys/fs/cgroup/hugetlb/hugetlb.2MB.limit_in_bytes") - if os.IsNotExist(err) { - t.Log("No hugetlb.2MB.limit_in_bytes - using alternate cgroups test file") - cgroupsToml = "example/cgroups-no-hugetlb.toml" - } - - if err := manager.ApplyFromFile(cgroupsToml); err != nil { - t.Fatal(err) - } - defer manager.Remove() - - rootPath := manager.GetCgroupRootPath() - if rootPath == "" { - t.Fatalf("can't determine cgroups root path, is cgroups enabled ?") - } - - cpuShares := filepath.Join(rootPath, "cpu", path, "cpu.shares") - - i, err := readIntFromFile(cpuShares) - if err != nil { - t.Errorf("failed to read %s: %s", cpuShares, err) - } - if i != 1024 { - t.Errorf("cpu shares should be equal to 1024") - } - - content := []byte("[cpu]\nshares = 512") - tmpfile, err := ioutil.TempFile("", "cgroups") - if err != nil { - t.Fatal(err) - } - - defer os.Remove(tmpfile.Name()) - - if _, err := tmpfile.Write(content); err != nil { - t.Fatal(err) - } - if err := tmpfile.Close(); err != nil { - t.Fatal(err) - } - - // test update/load from PID - manager = &Manager{Pid: pid} - - if err := manager.UpdateFromFile(tmpfile.Name()); err != nil { - t.Fatal(err) - } - i, err = readIntFromFile(cpuShares) - if err != nil { - t.Errorf("failed to read %s: %s", cpuShares, err) - } - if i != 512 { - t.Errorf("cpu shares should be equal to 512") - } - - pipe.Close() - - cmd.Wait() -} - -func TestPauseResume(t *testing.T) { - test.EnsurePrivilege(t) - - manager := &Manager{} - if err := manager.Pause(); err == nil { - t.Errorf("unexpected success with PID 0") - } - if err := manager.Resume(); err == nil { - t.Errorf("unexpected success with PID 0") - } - - cmd := exec.Command("/bin/cat") - pipe, err := cmd.StdinPipe() - if err != nil { - t.Fatal(err) - } - - if err := cmd.Start(); err != nil { - t.Fatal(err) - } - - manager.Pid = cmd.Process.Pid - manager.Path = filepath.Join("/singularity", strconv.Itoa(manager.Pid)) - - if err := manager.ApplyFromFile("example/cgroups.toml"); 
err != nil { - t.Fatal(err) - } - defer manager.Remove() - - manager.Pause() - - file, err := os.Open(fmt.Sprintf("/proc/%d/status", manager.Pid)) - if err != nil { - t.Error(err) - } - - scanner := bufio.NewScanner(file) - stateOk := false - - for scanner.Scan() { - if strings.HasPrefix(scanner.Text(), "State:\tD") { - stateOk = true - break - } - } - - if !stateOk { - t.Errorf("failed to pause process %d", manager.Pid) - } - - file.Close() - - manager.Resume() - - file, err = os.Open(fmt.Sprintf("/proc/%d/status", manager.Pid)) - if err != nil { - t.Error(err) - } - - scanner = bufio.NewScanner(file) - stateOk = false - - for scanner.Scan() { - text := scanner.Text() - if strings.HasPrefix(text, "State:\tS") || strings.HasPrefix(text, "State:\tR") { - stateOk = true - break - } - } - - if !stateOk { - t.Errorf("failed to resume process %d", manager.Pid) - } - - file.Close() - - pipe.Close() - - cmd.Wait() -} diff --git a/internal/pkg/cgroups/config_linux.go b/internal/pkg/cgroups/config_linux.go index 4fbe14f47e..af77814014 100644 --- a/internal/pkg/cgroups/config_linux.go +++ b/internal/pkg/cgroups/config_linux.go @@ -1,4 +1,4 @@ -// Copyright (c) 2018, Sylabs Inc. All rights reserved. +// Copyright (c) 2018-2021, Sylabs Inc. All rights reserved. // This software is licensed under a 3-clause BSD license. Please consult the // LICENSE.md file distributed with the sources of this project regarding your // rights to use or distribute this software. @@ -12,6 +12,13 @@ import ( "github.com/pelletier/go-toml" ) +func Int64ptr(i int) *int64 { + t := int64(i) + return &t +} + +var wildcard = Int64ptr(-1) + // LinuxHugepageLimit structure corresponds to limiting kernel hugepages type LinuxHugepageLimit struct { // Pagesize is the hugepage size @@ -160,6 +167,8 @@ type Config struct { // Limits are a set of key value pairs that define RDMA resource limits, // where the key is device name and value is resource limits. 
Rdma map[string]LinuxRdma `toml:"rdma" json:"rdma,omitempty"` + // TODO: Enable support for native cgroup v2 resource specifications + // Unified map[string]string `toml:"unified" json:"unified,omitempty"` } // LoadConfig opens cgroups controls config file and unmarshals it into structures diff --git a/internal/pkg/cgroups/example/cgroups-no-hugetlb.toml b/internal/pkg/cgroups/example/cgroups-no-hugetlb.toml index 0f0d99f155..f0b039fdf9 100644 --- a/internal/pkg/cgroups/example/cgroups-no-hugetlb.toml +++ b/internal/pkg/cgroups/example/cgroups-no-hugetlb.toml @@ -18,6 +18,7 @@ # realtimePeriod = 0 cpus = "0" mems = "0" + shares = 1024 # Memory restriction configuration diff --git a/internal/pkg/cgroups/example/cgroups.toml b/internal/pkg/cgroups/example/cgroups.toml index 6be2a1e156..d47e648f8b 100644 --- a/internal/pkg/cgroups/example/cgroups.toml +++ b/internal/pkg/cgroups/example/cgroups.toml @@ -11,13 +11,13 @@ # - cpus: CPUs to use within the cpuset. Default is to use any CPU available. # - mems: list of memory nodes in the cpuset. Default is to use any available memory node [cpu] -# shares = 512 # quotas = 0 # period = 0 # realtimeRuntime = 0 # realtimePeriod = 0 cpus = "0" mems = "0" + shares = 1024 # Memory restriction configuration diff --git a/internal/pkg/cgroups/manager_linux.go b/internal/pkg/cgroups/manager_linux.go new file mode 100644 index 0000000000..f2f60783a6 --- /dev/null +++ b/internal/pkg/cgroups/manager_linux.go @@ -0,0 +1,144 @@ +// Copyright (c) 2021, Sylabs Inc. All rights reserved. +// This software is licensed under a 3-clause BSD license. Please consult the +// LICENSE.md file distributed with the sources of this project regarding your +// rights to use or distribute this software. 
+ +package cgroups + +import ( + "encoding/json" + "path/filepath" + "strconv" + + "github.com/containerd/cgroups" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sylabs/singularity/pkg/sylog" +) + +// Manager is used to work with cgroups resource restrictions. It is an +// interface satisfied by different implementations for v1 and v2 cgroups. +type Manager interface { + // GetVersion returns the version of the cgroups interface in use by + // the manager. + GetVersion() int + // GetCgroupRootPath returns the path to the root of the cgroup on the + // filesystem. + GetCgroupRootPath() string + // ApplyFromFile applies a cgroup configuration from a toml file, creating a + // new group if necessary, and places the process with Manager.Pid into the + // cgroup. + ApplyFromFile(path string) error + // ApplyFromSpec applies a cgroups configuration from an OCI LinuxResources + // spec struct, creating a new group if necessary, and places the process + // with Manager.Pid into the cgroup. + ApplyFromSpec(spec *specs.LinuxResources) error + // UpdateFromFile updates the existing managed cgroup using configuration + // from a toml file. + UpdateFromFile(path string) error + // UpdateFromSpec updates the existing managed cgroup using configuration + // from an OCI LinuxResources spec struct. + UpdateFromSpec(spec *specs.LinuxResources) error + // AddProc adds the process with specified pid to the managed cgroup + AddProc(pid int) error + // Remove deletes the managed cgroup. + Remove() error + // Pause freezes processes in the managed cgroup. + Pause() error + // Resume unfreezes processes in the managed cgroup. + Resume() error +} + +// NewManagerFromFile creates a Manager, applies the configuration at specPath, and adds pid to the cgroup. +// If a group name is supplied, it will be used by the manager. +// If group = "" then "/singularity/<pid>" is used as a default. 
+func NewManagerFromFile(specPath string, pid int, group string) (manager Manager, err error) { + if group == "" { + group = filepath.Join("/singularity", strconv.Itoa(pid)) + } + if cgroups.Mode() == cgroups.Unified { + sylog.Debugf("Applying cgroups v2 configuration") + mgrv2 := ManagerV2{pid: pid, group: group} + return &mgrv2, mgrv2.ApplyFromFile(specPath) + } + + sylog.Debugf("Applying cgroups v1 configuration") + mgrv1 := ManagerV1{pid: pid, path: group} + return &mgrv1, mgrv1.ApplyFromFile(specPath) +} + +// NewManagerFromSpec creates a Manager, applies the configuration in spec, and adds pid to the cgroup. +// If a group name is supplied, it will be used by the manager. +// If group = "" then "/singularity/<pid>" is used as a default. +func NewManagerFromSpec(spec *specs.LinuxResources, pid int, group string) (manager Manager, err error) { + if group == "" { + group = filepath.Join("/singularity", strconv.Itoa(pid)) + } + + if cgroups.Mode() == cgroups.Unified { + sylog.Debugf("Applying cgroups v2 configuration") + mgrv2 := ManagerV2{pid: pid, group: group} + return &mgrv2, mgrv2.ApplyFromSpec(spec) + } + + sylog.Debugf("Applying cgroups v1 configuration") + mgrv1 := ManagerV1{pid: pid, path: group} + return &mgrv1, mgrv1.ApplyFromSpec(spec) +} + +// GetManager returns a Manager for the provided cgroup name/path. +func GetManager(group string) (manager Manager, err error) { + if cgroups.Mode() == cgroups.Unified { + sylog.Debugf("Fetching cgroups v2 configuration") + mgrv2 := ManagerV2{group: group} + if err := mgrv2.loadFromGroup(); err != nil { + return nil, err + } + return &mgrv2, nil + } + + sylog.Debugf("Fetching cgroups v1 configuration") + mgrv1 := ManagerV1{path: group} + if err := mgrv1.loadFromPath(); err != nil { + return nil, err + } + return &mgrv1, nil +} + +// GetManagerFromPid returns a Manager for the cgroup that pid is a member of. 
+func GetManagerFromPid(pid int) (manager Manager, err error) { + if cgroups.Mode() == cgroups.Unified { + sylog.Debugf("Fetching cgroups v2 configuration") + mgrv2 := ManagerV2{pid: pid} + if err := mgrv2.loadFromPid(); err != nil { + return nil, err + } + return &mgrv2, nil + } + + sylog.Debugf("Fetching cgroups v1 configuration") + mgrv1 := ManagerV1{pid: pid} + if err := mgrv1.loadFromPid(); err != nil { + return nil, err + } + return &mgrv1, nil +} + +// readSpecFromFile loads a TOML file containing a specs.LinuxResources cgroups configuration. +func readSpecFromFile(path string) (spec specs.LinuxResources, err error) { + conf, err := LoadConfig(path) + if err != nil { + return + } + + // convert TOML structures to OCI JSON structures + data, err := json.Marshal(conf) + if err != nil { + return + } + + if err = json.Unmarshal(data, &spec); err != nil { + return + } + + return +} diff --git a/internal/pkg/cgroups/manager_linux_test.go b/internal/pkg/cgroups/manager_linux_test.go new file mode 100644 index 0000000000..a45743d5b0 --- /dev/null +++ b/internal/pkg/cgroups/manager_linux_test.go @@ -0,0 +1,67 @@ +// Copyright (c) 2018-2021, Sylabs Inc. All rights reserved. +// This software is licensed under a 3-clause BSD license. Please consult the +// LICENSE.md file distributed with the sources of this project regarding your +// rights to use or distribute this software. 
+ +package cgroups + +import ( + "bufio" + "fmt" + "os" + "strconv" + "strings" + "testing" +) + +// ensureIntInFile asserts that the content of path is the integer wantInt +func ensureIntInFile(t *testing.T, path string, wantInt int64) { + file, err := os.Open(path) + if err != nil { + t.Errorf("while opening %q: %v", path, err) + return + } + defer file.Close() + + scanner := bufio.NewScanner(file) + hasData := scanner.Scan() + if !hasData { + t.Errorf("no data found in %q", path) + } + + val, err := strconv.ParseInt(scanner.Text(), 10, 64) + if err != nil { + t.Errorf("could not parse %q: %v", path, err) + } + + if val != wantInt { + t.Errorf("found %d in %q, expected %d", val, path, wantInt) + } +} + +// ensureState asserts that a process pid has the required state +func ensureState(t *testing.T, pid int, wantStates string) { + file, err := os.Open(fmt.Sprintf("/proc/%d/status", pid)) + if err != nil { + t.Error(err) + } + defer file.Close() + + scanner := bufio.NewScanner(file) + procState := "" + + for scanner.Scan() { + // State: R (running) + if strings.HasPrefix(scanner.Text(), "State:\t") { + f := strings.Fields(scanner.Text()) + if len(f) < 2 { + t.Errorf("Could not check process state - not enough fields: %s", scanner.Text()) + } + procState = f[1] + } + } + + if !strings.ContainsAny(procState, wantStates) { + t.Errorf("Process %d had state %q, expected state %q", pid, procState, wantStates) + } +} diff --git a/internal/pkg/cgroups/managerv1_linux.go b/internal/pkg/cgroups/managerv1_linux.go new file mode 100644 index 0000000000..b72c0306f3 --- /dev/null +++ b/internal/pkg/cgroups/managerv1_linux.go @@ -0,0 +1,165 @@ +// Copyright (c) 2018-2021, Sylabs Inc. All rights reserved. +// This software is licensed under a 3-clause BSD license. Please consult the +// LICENSE.md file distributed with the sources of this project regarding your +// rights to use or distribute this software. 
+ +package cgroups + +import ( + "fmt" + "path/filepath" + "strings" + + "github.com/containerd/cgroups" + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +// ManagerV1 manages a cgroup 'Path', containing process 'Pid' for a v1 cgroups hierarchy. +type ManagerV1 struct { + path string + pid int + cgroup cgroups.Cgroup +} + +func (m *ManagerV1) GetVersion() int { + return 1 +} + +func (m *ManagerV1) load() (err error) { + if m.path != "" { + return m.loadFromPath() + } + return m.loadFromPid() +} + +func (m *ManagerV1) loadFromPid() (err error) { + if m.pid == 0 { + return fmt.Errorf("cannot load from pid - no process ID specified") + } + path := cgroups.PidPath(m.pid) + m.cgroup, err = cgroups.Load(cgroups.V1, path) + return err +} + +func (m *ManagerV1) loadFromPath() (err error) { + if m.path == "" { + return fmt.Errorf("cannot load from path - no path specified") + } + path := cgroups.StaticPath(m.path) + m.cgroup, err = cgroups.Load(cgroups.V1, path) + return err +} + +// GetCgroupRootPath returns the path to the root of the cgroup on the +// filesystem. +func (m *ManagerV1) GetCgroupRootPath() string { + if m.cgroup == nil { + return "" + } + + for _, sub := range m.cgroup.Subsystems() { + processes, err := m.cgroup.Processes(sub.Name(), false) + if len(processes) == 0 || err != nil { + continue + } + process := processes[0] + cgroupPath := strings.Split(process.Path, string(sub.Name()))[0] + return filepath.Clean(cgroupPath) + } + + return "" +} + +// ApplyFromSpec applies a cgroups configuration from an OCI LinuxResources +// spec struct, creating a new group if necessary, and places the process +// with Manager.Pid into the cgroup. 
+func (m *ManagerV1) ApplyFromSpec(spec *specs.LinuxResources) (err error) { + var path cgroups.Path + + if !filepath.IsAbs(m.path) { + return fmt.Errorf("cgroup path must be an absolute path") + } + + path = cgroups.StaticPath(m.path) + + s := spec + if s == nil { + s = &specs.LinuxResources{} + } + + // creates cgroup + m.cgroup, err = cgroups.New(cgroups.V1, path, s) + if err != nil { + return err + } + + return m.cgroup.Add(cgroups.Process{Pid: m.pid}) +} + +// ApplyFromFile applies a cgroup configuration from a toml file, creating a +// new group if necessary, and places the process with Manager.Pid into the +// cgroup. +func (m *ManagerV1) ApplyFromFile(path string) error { + spec, err := readSpecFromFile(path) + if err != nil { + return err + } + return m.ApplyFromSpec(&spec) +} + +// UpdateFromSpec updates the existing managed cgroup using configuration +// from an OCI LinuxResources spec struct. +func (m *ManagerV1) UpdateFromSpec(spec *specs.LinuxResources) (err error) { + if m.cgroup == nil { + if err = m.load(); err != nil { + return + } + } + err = m.cgroup.Update(spec) + return +} + +// UpdateFromFile updates the existing managed cgroup using configuration +// from a toml file. +func (m *ManagerV1) UpdateFromFile(path string) error { + spec, err := readSpecFromFile(path) + if err != nil { + return err + } + return m.UpdateFromSpec(&spec) +} + +func (m *ManagerV1) AddProc(pid int) (err error) { + if m.cgroup == nil { + if err := m.load(); err != nil { + return err + } + } + return m.cgroup.Add(cgroups.Process{Pid: pid}) +} + +// Remove deletes the managed cgroup. +func (m *ManagerV1) Remove() error { + // deletes subgroup + return m.cgroup.Delete() +} + +// Pause freezes processes in the managed cgroup. +func (m *ManagerV1) Pause() error { + if m.cgroup == nil { + if err := m.load(); err != nil { + return err + } + } + return m.cgroup.Freeze() +} + +// Resume unfreezes process in the managed cgroup. 
+func (m *ManagerV1) Resume() error { + if m.cgroup == nil { + if err := m.load(); err != nil { + return err + } + } + return m.cgroup.Thaw() +} diff --git a/internal/pkg/cgroups/managerv1_linux_test.go b/internal/pkg/cgroups/managerv1_linux_test.go new file mode 100644 index 0000000000..5f055a53c0 --- /dev/null +++ b/internal/pkg/cgroups/managerv1_linux_test.go @@ -0,0 +1,112 @@ +// Copyright (c) 2018-2021, Sylabs Inc. All rights reserved. +// This software is licensed under a 3-clause BSD license. Please consult the +// LICENSE.md file distributed with the sources of this project regarding your +// rights to use or distribute this software. + +package cgroups + +import ( + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "strconv" + "testing" + + "github.com/sylabs/singularity/internal/pkg/test" + "github.com/sylabs/singularity/internal/pkg/test/tool/require" +) + +func TestCgroupsV1(t *testing.T) { + test.EnsurePrivilege(t) + require.CgroupsV1(t) + + cmd := exec.Command("/bin/cat", "/dev/zero") + if err := cmd.Start(); err != nil { + t.Fatal(err) + } + defer cmd.Process.Kill() + + pid := cmd.Process.Pid + strPid := strconv.Itoa(pid) + path := filepath.Join("/singularity", strPid) + + manager := &ManagerV1{pid: pid, path: path} + + cgroupsToml := "example/cgroups.toml" + // Some systems, e.g. ppc64le may not have a 2MB page size, so don't + // apply a 2MB hugetlb limit if that's the case. 
+ _, err := os.Stat("/sys/fs/cgroup/hugetlb/hugetlb.2MB.limit_in_bytes") + if os.IsNotExist(err) { + t.Log("No hugetlb.2MB.limit_in_bytes - using alternate cgroups test file") + cgroupsToml = "example/cgroups-no-hugetlb.toml" + } + + if err := manager.ApplyFromFile(cgroupsToml); err != nil { + t.Fatal(err) + } + defer manager.Remove() + + rootPath := manager.GetCgroupRootPath() + if rootPath == "" { + t.Fatalf("can't determine cgroups root path, is cgroups enabled ?") + } + + cpuShares := filepath.Join(rootPath, "cpu", path, "cpu.shares") + ensureIntInFile(t, cpuShares, 1024) + + content := []byte("[cpu]\nshares = 512") + tmpfile, err := ioutil.TempFile("", "cgroups") + if err != nil { + t.Fatal(err) + } + defer os.Remove(tmpfile.Name()) + if _, err := tmpfile.Write(content); err != nil { + t.Fatal(err) + } + if err := tmpfile.Close(); err != nil { + t.Fatal(err) + } + + // test update/load from PID + manager = &ManagerV1{pid: pid} + + if err := manager.UpdateFromFile(tmpfile.Name()); err != nil { + t.Fatal(err) + } + ensureIntInFile(t, cpuShares, 512) +} + +func TestPauseResumeV1(t *testing.T) { + test.EnsurePrivilege(t) + require.CgroupsV1(t) + + manager := &ManagerV1{} + if err := manager.Pause(); err == nil { + t.Errorf("unexpected success with PID 0") + } + if err := manager.Resume(); err == nil { + t.Errorf("unexpected success with PID 0") + } + + cmd := exec.Command("/bin/cat", "/dev/zero") + if err := cmd.Start(); err != nil { + t.Fatal(err) + } + defer cmd.Process.Kill() + + manager.pid = cmd.Process.Pid + manager.path = filepath.Join("/singularity", strconv.Itoa(manager.pid)) + + if err := manager.ApplyFromFile("example/cgroups.toml"); err != nil { + t.Fatal(err) + } + defer manager.Remove() + + manager.Pause() + // cgroups v1 freeze is to uninterruptable sleep + ensureState(t, manager.pid, "D") + + manager.Resume() + ensureState(t, manager.pid, "RS") +} diff --git a/internal/pkg/cgroups/managerv2_linux.go b/internal/pkg/cgroups/managerv2_linux.go new 
file mode 100644 index 0000000000..e4ad4c380a --- /dev/null +++ b/internal/pkg/cgroups/managerv2_linux.go @@ -0,0 +1,217 @@ +// Copyright (c) 2021, Sylabs Inc. All rights reserved. +// This software is licensed under a 3-clause BSD license. Please consult the +// LICENSE.md file distributed with the sources of this project regarding your +// rights to use or distribute this software. + +package cgroups + +import ( + "fmt" + "path" + + cgroupsv2 "github.com/containerd/cgroups/v2" + specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sylabs/singularity/pkg/sylog" +) + +const mountPoint = "/sys/fs/cgroup" + +// ManagerV2 manages a cgroup 'Group', containing process 'Pid' for a v2 unified cgroups hierarchy. +type ManagerV2 struct { + group string + pid int + cgroup *cgroupsv2.Manager +} + +func (m *ManagerV2) load() (err error) { + if m.group != "" { + return m.loadFromGroup() + } + return m.loadFromPid() +} + +func (m *ManagerV2) loadFromPid() (err error) { + if m.pid == 0 { + return fmt.Errorf("cannot load from pid - no process ID specified") + } + group, err := cgroupsv2.PidGroupPath(m.pid) + if err != nil { + return fmt.Errorf("could not find group for pid %d: %v", m.pid, err) + } + m.cgroup, err = cgroupsv2.LoadManager(mountPoint, group) + return err +} + +func (m *ManagerV2) loadFromGroup() (err error) { + if m.group == "" { + return fmt.Errorf("cannot load from group - no group specified") + } + m.cgroup, err = cgroupsv2.LoadManager(mountPoint, m.group) + return err +} + +func (m *ManagerV2) GetVersion() int { + return 2 +} + +// GetCgroupRootPath returns cgroup root path +func (m *ManagerV2) GetCgroupRootPath() string { + if m.group == "" { + return "" + } + return path.Join(mountPoint, m.group) +} + +// ApplyFromSpec applies a cgroups configuration from an OCI LinuxResources spec +// struct, creating a new group if necessary, and places the process with +// Manager.Pid into the cgroup. 
+// The `Unified` key for native v2 cgroup
+// specifications is not yet supported.
+//
+// A nil spec is treated as an empty LinuxResources. An error is returned if
+// the manager has no group or no pid set, if the cgroup cannot be created, or
+// if the process cannot be added to it.
+func (m *ManagerV2) ApplyFromSpec(spec *specs.LinuxResources) (err error) {
+	// Substitute an empty spec for nil *before* any field of it is read, so
+	// that a nil spec cannot cause a nil pointer dereference below.
+	s := spec
+	if s == nil {
+		s = &specs.LinuxResources{}
+	}
+	if len(s.Unified) > 0 {
+		sylog.Warningf("Unified cgroup resource specifications are not supported, and will not be applied.")
+	}
+	if m.group == "" {
+		return fmt.Errorf("group must be specified when creating a cgroup")
+	}
+	if m.pid == 0 {
+		return fmt.Errorf("pid must be specified when creating a cgroup")
+	}
+
+	// translate the LinuxResources cgroups v1 / OCI spec to v2 Resources
+	res := cgroupsv2.ToResources(s)
+	// v1 device restrictions have to manually be brought across into the v2
+	// Resources struct, as ToResources(s) doesn't do this. They will then be
+	// converted to ebpf programs and attached when the cgroup is created.
+	res.Devices = v2FixDevices(s.Devices)
+
+	// creates cgroup
+	m.cgroup, err = cgroupsv2.NewManager(mountPoint, m.group, res)
+	if err != nil {
+		return err
+	}
+
+	return m.cgroup.AddProc(uint64(m.pid))
+}
+
+// ApplyFromFile applies a cgroup configuration from a toml file, creating a new
+// group if necessary, and places the process with Manager.Pid into the cgroup.
+// The `Unified` key for native v2 cgroup specifications is not yet supported.
+func (m *ManagerV2) ApplyFromFile(path string) error {
+	spec, err := readSpecFromFile(path)
+	if err != nil {
+		return err
+	}
+	return m.ApplyFromSpec(&spec)
+}
+
+// UpdateFromSpec updates the existing managed cgroup using configuration from
+// an OCI LinuxResources spec struct. The `Unified` key for native v2 cgroup
+// specifications is not yet supported.
+func (m *ManagerV2) UpdateFromSpec(spec *specs.LinuxResources) (err error) {
+	// Substitute an empty spec for nil *before* any field of it is read, so
+	// that a nil spec cannot cause a nil pointer dereference below.
+	s := spec
+	if s == nil {
+		s = &specs.LinuxResources{}
+	}
+	if len(s.Unified) > 0 {
+		sylog.Warningf("Unified cgroup resource specifications are not supported, and will not be applied.")
+	}
+	// When the group is not known, resolve it from the pid.
+	if m.group == "" {
+		if m.pid == 0 {
+			return fmt.Errorf("pid must be provided if group is not known")
+		}
+		m.group, err = cgroupsv2.PidGroupPath(m.pid)
+		if err != nil {
+			// %w keeps the underlying error available to errors.Is / errors.As.
+			return fmt.Errorf("could not find group for pid %d: %w", m.pid, err)
+		}
+	}
+
+	// translate the LinuxResources cgroupsv1 / OCI spec to v2 Resources
+	res := cgroupsv2.ToResources(s)
+	// v1 device restrictions have to manually be brought across into the v2 Resources struct,
+	// as ToResources doesn't do this. They will then be converted to ebpf programs and attached.
+	res.Devices = v2FixDevices(s.Devices)
+
+	// updates existing cgroup
+	m.cgroup, err = cgroupsv2.NewManager(mountPoint, m.group, res)
+	return err
+}
+
+// UpdateFromFile updates the existing managed cgroup using configuration
+// from a toml file.
+func (m *ManagerV2) UpdateFromFile(path string) error {
+	spec, err := readSpecFromFile(path)
+	if err != nil {
+		return err
+	}
+	return m.UpdateFromSpec(&spec)
+}
+
+// Remove deletes the managed cgroup. If the cgroup has not yet been loaded or
+// created through this manager it is loaded first, as AddProc and Pause do,
+// so that calling Remove on such a manager returns an error instead of
+// dereferencing a nil cgroup.
+func (m *ManagerV2) Remove() (err error) {
+	if m.cgroup == nil {
+		if err := m.load(); err != nil {
+			return err
+		}
+	}
+	// deletes subgroup
+	return m.cgroup.Delete()
+}
+
+// AddProc places the process with the given pid into the managed cgroup,
+// loading the cgroup first if it has not been loaded yet.
+func (m *ManagerV2) AddProc(pid int) (err error) {
+	if m.cgroup == nil {
+		if err := m.load(); err != nil {
+			return err
+		}
+	}
+	return m.cgroup.AddProc(uint64(pid))
+}
+
+// Pause freezes processes in the managed cgroup, loading the cgroup first if
+// it has not been loaded yet.
+func (m *ManagerV2) Pause() (err error) {
+	if m.cgroup == nil {
+		if err := m.load(); err != nil {
+			return err
+		}
+	}
+	return m.cgroup.Freeze()
+}
+
+// Resume unfreezes process in the managed cgroup.
+func (m *ManagerV2) Resume() (err error) {
+	// Fast path: the cgroup is already loaded, thaw it directly.
+	if m.cgroup != nil {
+		return m.cgroup.Thaw()
+	}
+	if loadErr := m.load(); loadErr != nil {
+		return loadErr
+	}
+	return m.cgroup.Thaw()
+}
+
+// v2FixDevices rewrites device entries, in place, so that every implied
+// wildcard becomes an explicit one, and returns the same slice.
+//
+// containerd/cgroups v1 device handling accepts:
+//   "" for type, which is replaced as "a"
+//   nil for major/minor, which is replaced as -1
+//
+// containerd/cgroups v2 will not handle the "" and nil, so the explicit
+// wildcard has to be filled in here.
+func v2FixDevices(devs []specs.LinuxDeviceCgroup) []specs.LinuxDeviceCgroup {
+	for idx := range devs {
+		// Work on the element itself; the caller's slice is updated in place.
+		dev := &devs[idx]
+		if dev.Type == "" {
+			dev.Type = "a"
+		}
+		if dev.Major == nil {
+			dev.Major = wildcard
+		}
+		if dev.Minor == nil {
+			dev.Minor = wildcard
+		}
+	}
+	return devs
+}
diff --git a/internal/pkg/cgroups/managerv2_linux_test.go b/internal/pkg/cgroups/managerv2_linux_test.go
new file mode 100644
index 0000000000..7122265f39
--- /dev/null
+++ b/internal/pkg/cgroups/managerv2_linux_test.go
@@ -0,0 +1,122 @@
+// Copyright (c) 2021, Sylabs Inc. All rights reserved.
+// This software is licensed under a 3-clause BSD license. Please consult the
+// LICENSE.md file distributed with the sources of this project regarding your
+// rights to use or distribute this software.
+ +package cgroups + +import ( + "io/ioutil" + "os" + "os/exec" + "path" + "path/filepath" + "strconv" + "testing" + + "github.com/sylabs/singularity/internal/pkg/test" + "github.com/sylabs/singularity/internal/pkg/test/tool/require" +) + +func TestCgroupsV2(t *testing.T) { + test.EnsurePrivilege(t) + require.CgroupsV2(t) + + // Create process to put into a cgroup + cmd := exec.Command("/bin/cat", "/dev/zero") + if err := cmd.Start(); err != nil { + t.Fatal(err) + } + defer cmd.Process.Kill() + + pid := cmd.Process.Pid + strPid := strconv.Itoa(pid) + group := filepath.Join("/singularity", strPid) + + manager := &ManagerV2{pid: pid, group: group} + + // Example sets various things - we will check [pids] limit = 1024 + cgroupsToml := "example/cgroups.toml" + // Some systems, e.g. ppc64le may not have a 2MB page size, so don't + // apply a 2MB hugetlb limit if that's the case. + _, err := os.Stat("/sys/fs/cgroup/dev-hugepages.mount/hugetlb.2MB.max") + if os.IsNotExist(err) { + t.Log("No hugetlb.2MB.max - using alternate cgroups test file") + cgroupsToml = "example/cgroups-no-hugetlb.toml" + } + + // Create a new cgroup with example config + if err := manager.ApplyFromFile(cgroupsToml); err != nil { + t.Fatal(err) + } + defer manager.Remove() + + // For cgroups v2 [pids] limit -> pids.max + // Check for correct 1024 value + pidsMax := filepath.Join(mountPoint, group, "pids.max") + ensureIntInFile(t, pidsMax, 1024) + + // Write a new config with [pids] limit = 512 + content := []byte("[pids]\nlimit = 512") + tmpfile, err := ioutil.TempFile("", "cgroups") + if err != nil { + t.Fatal(err) + } + defer os.Remove(tmpfile.Name()) + if _, err := tmpfile.Write(content); err != nil { + t.Fatal(err) + } + if err := tmpfile.Close(); err != nil { + t.Fatal(err) + } + + // test update/load from PID + manager = &ManagerV2{pid: pid} + + // Update existing cgroup from new config + if err := manager.UpdateFromFile(tmpfile.Name()); err != nil { + t.Fatal(err) + } + + // Check pids.max
is now 512 + ensureIntInFile(t, pidsMax, 512) +} + +func TestPauseResumeV2(t *testing.T) { + test.EnsurePrivilege(t) + require.CgroupsV2(t) + + manager := &ManagerV2{} + if err := manager.Pause(); err == nil { + t.Errorf("unexpected success with PID 0") + } + if err := manager.Resume(); err == nil { + t.Errorf("unexpected success with PID 0") + } + + cmd := exec.Command("/bin/cat", "/dev/zero") + if err := cmd.Start(); err != nil { + t.Fatal(err) + } + defer cmd.Process.Kill() + + manager.pid = cmd.Process.Pid + manager.group = filepath.Join("/singularity", strconv.Itoa(manager.pid)) + + if err := manager.ApplyFromFile("example/cgroups.toml"); err != nil { + t.Fatal(err) + } + defer manager.Remove() + + manager.Pause() + // cgroups v2 freeze is to interruptible sleep, which could actually occur + // for our cat /dev/zero while it's running, so check freeze marker as well + // as the process state here. + ensureState(t, manager.pid, "S") + freezePath := path.Join(mountPoint, manager.group, "cgroup.freeze") + ensureIntInFile(t, freezePath, 1) + + manager.Resume() + ensureState(t, manager.pid, "RS") + ensureIntInFile(t, freezePath, 0) +} diff --git a/internal/pkg/instance/instance_linux.go b/internal/pkg/instance/instance_linux.go index 9af0793240..a4d019f2e3 100644 --- a/internal/pkg/instance/instance_linux.go +++ b/internal/pkg/instance/instance_linux.go @@ -1,4 +1,4 @@ -// Copyright (c) 2018-2019, Sylabs Inc. All rights reserved. +// Copyright (c) 2018-2021, Sylabs Inc. All rights reserved. // This software is licensed under a 3-clause BSD license. Please consult the // LICENSE.md file distributed with the sources of this project regarding your // rights to use or distribute this software.
@@ -46,6 +46,7 @@ type File struct { Image string `json:"image"` Config []byte `json:"config"` UserNs bool `json:"userns"` + Cgroup bool `json:"cgroup"` IP string `json:"ip"` LogErrPath string `json:"logErrPath"` LogOutPath string `json:"logOutPath"` diff --git a/internal/pkg/runtime/engine/config/oci/config.go b/internal/pkg/runtime/engine/config/oci/config.go index 8eb6f42243..cb2418a311 100644 --- a/internal/pkg/runtime/engine/config/oci/config.go +++ b/internal/pkg/runtime/engine/config/oci/config.go @@ -1,4 +1,4 @@ -// Copyright (c) 2018-2020, Sylabs Inc. All rights reserved. +// Copyright (c) 2018-2021, Sylabs Inc. All rights reserved. // This software is licensed under a 3-clause BSD license. Please consult the // LICENSE.md file distributed with the sources of this project regarding your // rights to use or distribute this software. @@ -9,6 +9,7 @@ import ( "encoding/json" "fmt" + "github.com/containerd/cgroups" specs "github.com/opencontainers/runtime-spec/specs-go" cseccomp "github.com/seccomp/containers-golang" "github.com/sylabs/singularity/internal/pkg/runtime/engine/config/oci/generate" @@ -36,8 +37,17 @@ func (c *Config) UnmarshalJSON(b []byte) error { } // DefaultConfig returns an OCI config generator with a -// default OCI configuration. +// default OCI configuration for cgroups v1 or v2 dependent on the current host. func DefaultConfig() (*generate.Generator, error) { + if cgroups.Mode() == cgroups.Unified { + return DefaultConfigV2() + } + return DefaultConfigV1() +} + +// DefaultConfigV1 returns an OCI config generator with a +// default OCI configuration for cgroups v1. +func DefaultConfigV1() (*generate.Generator, error) { var err error config := specs.Spec{ @@ -193,6 +203,8 @@ func DefaultConfig() (*generate.Generator, error) { config.Linux = &specs.Linux{ Resources: &specs.LinuxResources{ Devices: []specs.LinuxDeviceCgroup{ + // Wildcard blocking access to all devices by default. 
+ // Note that essential cgroupDevices allow rules are inserted ahead of this. { Allow: false, Access: "rwm", @@ -227,3 +239,30 @@ func DefaultConfig() (*generate.Generator, error) { return &generate.Generator{Config: &config}, nil } + +// DefaultConfigV2 returns an OCI config generator with a default OCI configuration for cgroups v2. +// This is identical to v1 except that we use a cgroup namespace, and mount the namespaced +// cgroup fs into the container. +func DefaultConfigV2() (*generate.Generator, error) { + gen, err := DefaultConfigV1() + if err != nil { + return nil, err + } + c := gen.Config + + // TODO: Enter a cgroup namespace + // See https://github.com/sylabs/singularity/issues/298 + // We need to be unsharing the namespace at an appropriate point before we can enable this. + // + // c.Linux.Namespaces = append(c.Linux.Namespaces, specs.LinuxNamespace{Type: "cgroup"}) + + // Mount the unified cgroup v2 hierarchy + c.Mounts = append(c.Mounts, specs.Mount{ + Destination: "/sys/fs/cgroup", + Type: "cgroup2", + Source: "cgroup2", + Options: []string{"nosuid", "noexec", "nodev", "ro"}, + }) + + return &generate.Generator{Config: c}, nil +} diff --git a/internal/pkg/runtime/engine/oci/cleanup_linux.go b/internal/pkg/runtime/engine/oci/cleanup_linux.go index 39810006b8..d768f10d20 100644 --- a/internal/pkg/runtime/engine/oci/cleanup_linux.go +++ b/internal/pkg/runtime/engine/oci/cleanup_linux.go @@ -1,4 +1,4 @@ -// Copyright (c) 2018-2020, Sylabs Inc. All rights reserved. +// Copyright (c) 2018-2021, Sylabs Inc. All rights reserved. // This software is licensed under a 3-clause BSD license. Please consult the // LICENSE.md file distributed with the sources of this project regarding your // rights to use or distribute this software. @@ -29,7 +29,9 @@ import ( // command set requires privileged execution. 
func (e *EngineOperations) CleanupContainer(ctx context.Context, fatal error, status syscall.WaitStatus) error { if e.EngineConfig.Cgroups != nil { - e.EngineConfig.Cgroups.Remove() + if err := e.EngineConfig.Cgroups.Remove(); err != nil { + sylog.Warningf("failed to remove cgroup configuration: %v", err) + } } pidFile := e.EngineConfig.GetPidFile() diff --git a/internal/pkg/runtime/engine/oci/config_linux.go b/internal/pkg/runtime/engine/oci/config_linux.go index 2573185da6..6951b0e355 100644 --- a/internal/pkg/runtime/engine/oci/config_linux.go +++ b/internal/pkg/runtime/engine/oci/config_linux.go @@ -1,4 +1,4 @@ -// Copyright (c) 2018, Sylabs Inc. All rights reserved. +// Copyright (c) 2018-2021, Sylabs Inc. All rights reserved. // This software is licensed under a 3-clause BSD license. Please consult the // LICENSE.md file distributed with the sources of this project regarding your // rights to use or distribute this software. @@ -18,20 +18,20 @@ const Name = "oci" // EngineConfig is the config for the OCI engine. 
type EngineConfig struct { - BundlePath string `json:"bundlePath"` - LogPath string `json:"logPath"` - LogFormat string `json:"logFormat"` - PidFile string `json:"pidFile"` - OciConfig *oci.Config `json:"ociConfig"` - MasterPts int `json:"masterPts"` - SlavePts int `json:"slavePts"` - OutputStreams [2]int `json:"outputStreams"` - ErrorStreams [2]int `json:"errorStreams"` - InputStreams [2]int `json:"inputStreams"` - SyncSocket string `json:"syncSocket"` - EmptyProcess bool `json:"emptyProcess"` - Exec bool `json:"exec"` - Cgroups *cgroups.Manager `json:"-"` + BundlePath string `json:"bundlePath"` + LogPath string `json:"logPath"` + LogFormat string `json:"logFormat"` + PidFile string `json:"pidFile"` + OciConfig *oci.Config `json:"ociConfig"` + MasterPts int `json:"masterPts"` + SlavePts int `json:"slavePts"` + OutputStreams [2]int `json:"outputStreams"` + ErrorStreams [2]int `json:"errorStreams"` + InputStreams [2]int `json:"inputStreams"` + SyncSocket string `json:"syncSocket"` + EmptyProcess bool `json:"emptyProcess"` + Exec bool `json:"exec"` + Cgroups cgroups.Manager `json:"-"` sync.Mutex `json:"-"` State ociruntime.State `json:"state"` diff --git a/internal/pkg/runtime/engine/oci/create_linux.go b/internal/pkg/runtime/engine/oci/create_linux.go index 1c0de42552..9068de8a87 100644 --- a/internal/pkg/runtime/engine/oci/create_linux.go +++ b/internal/pkg/runtime/engine/oci/create_linux.go @@ -1,4 +1,4 @@ -// Copyright (c) 2018-2020, Sylabs Inc. All rights reserved. +// Copyright (c) 2018-2021, Sylabs Inc. All rights reserved. // This software is licensed under a 3-clause BSD license. Please consult the // LICENSE.md file distributed with the sources of this project regarding your // rights to use or distribute this software. 
@@ -62,86 +62,82 @@ var devices = []device{ {1, 5, "/dev/zero", syscall.S_IFCHR | 0o666, 0, 0}, } -func int64ptr(i int) *int64 { - t := int64(i) - return &t -} - var cgroupDevices = []specs.LinuxDeviceCgroup{ { Allow: true, Type: "c", - Major: int64ptr(1), - Minor: int64ptr(7), + Major: cgroups.Int64ptr(1), + Minor: cgroups.Int64ptr(7), Access: "rw", }, { Allow: true, Type: "c", - Major: int64ptr(1), - Minor: int64ptr(3), + Major: cgroups.Int64ptr(1), + Minor: cgroups.Int64ptr(3), Access: "rw", }, { Allow: true, Type: "c", - Major: int64ptr(1), - Minor: int64ptr(8), + Major: cgroups.Int64ptr(1), + Minor: cgroups.Int64ptr(8), Access: "rw", }, { Allow: true, Type: "c", - Major: int64ptr(5), - Minor: int64ptr(0), + Major: cgroups.Int64ptr(5), + Minor: cgroups.Int64ptr(0), Access: "rw", }, { Allow: true, Type: "c", - Major: int64ptr(1), - Minor: int64ptr(9), + Major: cgroups.Int64ptr(1), + Minor: cgroups.Int64ptr(9), Access: "rw", }, { Allow: true, Type: "c", - Major: int64ptr(1), - Minor: int64ptr(5), + Major: cgroups.Int64ptr(1), + Minor: cgroups.Int64ptr(5), Access: "rw", }, { Allow: true, Type: "c", - Major: int64ptr(136), + Major: cgroups.Int64ptr(136), + Minor: cgroups.Int64ptr(-1), Access: "rwm", }, { Allow: true, Type: "c", - Major: int64ptr(5), - Minor: int64ptr(1), + Major: cgroups.Int64ptr(5), + Minor: cgroups.Int64ptr(1), Access: "rw", }, { Allow: true, Type: "c", - Major: int64ptr(5), - Minor: int64ptr(2), + Major: cgroups.Int64ptr(5), + Minor: cgroups.Int64ptr(2), Access: "rw", }, } type container struct { - engine *EngineOperations - rpcOps *client.RPC - rootfs string - rpcRoot string - userNS bool - utsNS bool - mntNS bool - devIndex int - cgroupIndex int + engine *EngineOperations + rpcOps *client.RPC + rootfs string + rpcRoot string + userNS bool + utsNS bool + mntNS bool + devIndex int + cgroupV1MountIndex int } var statusChan = make(chan string, 1) @@ -191,12 +187,12 @@ func (e *EngineOperations) CreateContainer(ctx context.Context, pid int, rpcConn 
} c := &container{ - engine: e, - rpcOps: rpcOps, - rootfs: resolvedRootfs, - rpcRoot: fmt.Sprintf("/proc/%d/root", pid), - cgroupIndex: -1, - devIndex: -1, + engine: e, + rpcOps: rpcOps, + rootfs: resolvedRootfs, + rpcRoot: fmt.Sprintf("/proc/%d/root", pid), + cgroupV1MountIndex: -1, + devIndex: -1, } for _, ns := range e.EngineConfig.OciConfig.Linux.Namespaces { @@ -220,9 +216,9 @@ func (e *EngineOperations) CreateContainer(ctx context.Context, pid int, rpcConn system := &mount.System{Points: p, Mount: c.mount} for i, point := range e.EngineConfig.OciConfig.Config.Mounts { - // cgroup creation + // A cgroup v1 mount point will be intercepted and handled separately in c.addCgroups(...) if point.Type == "cgroup" { - c.cgroupIndex = i + c.cgroupV1MountIndex = i continue } // dev creation @@ -498,17 +494,18 @@ func (c *container) addCgroups(pid int, system *mount.System) error { c.engine.EngineConfig.OciConfig.Linux.CgroupsPath = cgroupsPath - manager := &cgroups.Manager{Path: cgroupsPath, Pid: pid} - - if err := manager.ApplyFromSpec(c.engine.EngineConfig.OciConfig.Linux.Resources); err != nil { + manager, err := cgroups.NewManagerFromSpec(c.engine.EngineConfig.OciConfig.Linux.Resources, pid, cgroupsPath) + if err != nil { return fmt.Errorf("failed to apply cgroups resources restriction: %s", err) } - if c.cgroupIndex >= 0 { - m := c.engine.EngineConfig.OciConfig.Config.Mounts[c.cgroupIndex] + // If a mount point exists for a cgroup v1 hierarchy we will handle it here. + // This is not necessary for cgroups v2 - as the unified hierarchy will be handled with a simple bind. 
+ if c.cgroupV1MountIndex >= 0 { + m := c.engine.EngineConfig.OciConfig.Config.Mounts[c.cgroupV1MountIndex] c.engine.EngineConfig.OciConfig.Config.Mounts = append( - c.engine.EngineConfig.OciConfig.Config.Mounts[:c.cgroupIndex], - c.engine.EngineConfig.OciConfig.Config.Mounts[c.cgroupIndex+1:]..., + c.engine.EngineConfig.OciConfig.Config.Mounts[:c.cgroupV1MountIndex], + c.engine.EngineConfig.OciConfig.Config.Mounts[c.cgroupV1MountIndex+1:]..., ) cgroupRootPath := manager.GetCgroupRootPath() @@ -807,7 +804,11 @@ func (c *container) addDevices(system *mount.System) error { c.engine.EngineConfig.OciConfig.Linux.Resources = &specs.LinuxResources{} } - c.engine.EngineConfig.OciConfig.Linux.Resources.Devices = append(c.engine.EngineConfig.OciConfig.Linux.Resources.Devices, cgroupDevices...) + // cgroupDevices are essential for operation, so must be allowed *prior* to a configured wildcard deny. + // containerd/cgroups/v2 device filtering via eBPF is written such that it stops at the wildcard. + // See: https://github.com/containerd/cgroups/blob/ddda8a174e9ae86b31366812ae2d0f9f9570a7f1/v2/devicefilter.go#L93 + // https://github.com/containerd/cgroups/blob/ddda8a174e9ae86b31366812ae2d0f9f9570a7f1/v2/devicefilter.go#L164 + c.engine.EngineConfig.OciConfig.Linux.Resources.Devices = append(cgroupDevices, c.engine.EngineConfig.OciConfig.Linux.Resources.Devices...) } return nil diff --git a/internal/pkg/runtime/engine/oci/prepare_linux.go b/internal/pkg/runtime/engine/oci/prepare_linux.go index debed8b0b4..4cd182e478 100644 --- a/internal/pkg/runtime/engine/oci/prepare_linux.go +++ b/internal/pkg/runtime/engine/oci/prepare_linux.go @@ -1,4 +1,4 @@ -// Copyright (c) 2018-2020, Sylabs Inc. All rights reserved. +// Copyright (c) 2018-2021, Sylabs Inc. All rights reserved. // This software is licensed under a 3-clause BSD license. Please consult the // LICENSE.md file distributed with the sources of this project regarding your // rights to use or distribute this software. 
@@ -9,9 +9,9 @@ import ( "fmt" "os" - "github.com/containerd/cgroups" "github.com/kr/pty" specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sylabs/singularity/internal/pkg/cgroups" "github.com/sylabs/singularity/internal/pkg/runtime/engine/config/starter" "github.com/sylabs/singularity/pkg/ociruntime" "github.com/sylabs/singularity/pkg/sylog" @@ -208,16 +208,15 @@ func (e *EngineOperations) PrepareConfig(starterConfig *starter.Config) error { if cPath == "" { return nil } - - // add executed process to container cgroups ppid := os.Getppid() - staticPath := cgroups.StaticPath(cPath) - control, err := cgroups.Load(cgroups.V1, staticPath) + + sylog.Debugf("Adding process %d to instance cgroup %q", ppid, cPath) + manager, err := cgroups.GetManager(cPath) if err != nil { - return fmt.Errorf("failed to load cgroups: %s", err) + return fmt.Errorf("couldn't create cgroup manager: %v", err) } - if err := control.Add(cgroups.Process{Pid: ppid}); err != nil { - return fmt.Errorf("failed to add exec process to cgroups %s: %s", cPath, err) + if err := manager.AddProc(ppid); err != nil { + return fmt.Errorf("couldn't add process to instance cgroup: %v", err) } } diff --git a/internal/pkg/runtime/engine/singularity/cleanup_linux.go b/internal/pkg/runtime/engine/singularity/cleanup_linux.go index c0241d51b3..339b07ad6a 100644 --- a/internal/pkg/runtime/engine/singularity/cleanup_linux.go +++ b/internal/pkg/runtime/engine/singularity/cleanup_linux.go @@ -89,8 +89,8 @@ func (e *EngineOperations) CleanupContainer(ctx context.Context, fatal error, st } } - if cgroupManager != nil { - if err := cgroupManager.Remove(); err != nil { + if cgroupsManager != nil { + if err := cgroupsManager.Remove(); err != nil { sylog.Errorf("could not remove cgroups: %v", err) } } diff --git a/internal/pkg/runtime/engine/singularity/container_linux.go b/internal/pkg/runtime/engine/singularity/container_linux.go index ed8d1554af..477b202988 100644 --- 
a/internal/pkg/runtime/engine/singularity/container_linux.go +++ b/internal/pkg/runtime/engine/singularity/container_linux.go @@ -50,11 +50,11 @@ import ( // - cleanup // - post start process var ( - cryptDev string - networkSetup *network.Setup - cgroupManager *cgroups.Manager - imageDriver image.Driver - umountPoints []string + cryptDev string + networkSetup *network.Setup + imageDriver image.Driver + umountPoints []string + cgroupsManager cgroups.Manager ) // defaultCNIConfPath is the default directory to CNI network configuration files. @@ -297,10 +297,9 @@ func create(ctx context.Context, engine *EngineOperations, rpcOps *client.RPC, p if os.Geteuid() == 0 && !c.userNS { path := engine.EngineConfig.GetCgroupsPath() if path != "" { - cgroupPath := filepath.Join("/singularity", strconv.Itoa(pid)) - cgroupManager = &cgroups.Manager{Pid: pid, Path: cgroupPath} - if err := cgroupManager.ApplyFromFile(path); err != nil { - return fmt.Errorf("failed to apply cgroups resources restriction: %s", err) + cgroupsManager, err = cgroups.NewManagerFromFile(path, pid, "") + if err != nil { + return fmt.Errorf("while applying cgroups config: %v", err) } } } diff --git a/internal/pkg/runtime/engine/singularity/prepare_linux.go b/internal/pkg/runtime/engine/singularity/prepare_linux.go index d326028b8d..6e8c5e2691 100644 --- a/internal/pkg/runtime/engine/singularity/prepare_linux.go +++ b/internal/pkg/runtime/engine/singularity/prepare_linux.go @@ -19,9 +19,9 @@ import ( "syscall" "github.com/ProtonMail/go-crypto/openpgp" - "github.com/containerd/cgroups" specs "github.com/opencontainers/runtime-spec/specs-go" "github.com/sylabs/singularity/internal/pkg/buildcfg" + "github.com/sylabs/singularity/internal/pkg/cgroups" fakerootutil "github.com/sylabs/singularity/internal/pkg/fakeroot" "github.com/sylabs/singularity/internal/pkg/instance" "github.com/sylabs/singularity/internal/pkg/plugin" @@ -899,14 +899,15 @@ func (e *EngineOperations) prepareInstanceJoinConfig(starterConfig 
*starter.Conf e.EngineConfig.OciConfig.Linux.Seccomp = instanceEngineConfig.OciConfig.Linux.Seccomp } - if uid == 0 && !file.UserNs { - pid := os.Getppid() - path := fmt.Sprintf("/singularity/%d", file.Pid) - control, err := cgroups.Load(cgroups.V1, cgroups.StaticPath(path)) - if err == nil { - if err := control.Add(cgroups.Process{Pid: pid}); err != nil { - return fmt.Errorf("while adding process to instance cgroups: %s", err) - } + if file.Cgroup { + sylog.Debugf("Adding process to instance cgroup") + ppid := os.Getppid() + manager, err := cgroups.GetManagerFromPid(file.Pid) + if err != nil { + return fmt.Errorf("couldn't create cgroup manager: %v", err) + } + if err := manager.AddProc(ppid); err != nil { + return fmt.Errorf("couldn't add process to instance cgroup: %v", err) } } diff --git a/internal/pkg/runtime/engine/singularity/process_linux.go b/internal/pkg/runtime/engine/singularity/process_linux.go index 3c1e1e1cde..8a98052cef 100644 --- a/internal/pkg/runtime/engine/singularity/process_linux.go +++ b/internal/pkg/runtime/engine/singularity/process_linux.go @@ -398,6 +398,12 @@ func (e *EngineOperations) PostStartProcess(ctx context.Context, pid int) error } } + // If we are using cgroups with this instance then mark that in the instance config. + // We don't store the path, as we will get the cgroup manager by Pid. + if e.EngineConfig.GetCgroupsPath() != "" { + file.Cgroup = true + } + // grab configuration to store in instance file file.Config, err = json.Marshal(e.CommonConfig) if err != nil { diff --git a/internal/pkg/test/tool/require/require.go b/internal/pkg/test/tool/require/require.go index a142cd00c6..9b59cad30d 100644 --- a/internal/pkg/test/tool/require/require.go +++ b/internal/pkg/test/tool/require/require.go @@ -122,12 +122,30 @@ func Network(t *testing.T) { } } -// Cgroups checks that cgroups is enabled, if not the +// Cgroups checks that any cgroups version is enabled, if not the // current test is skipped with a message. 
func Cgroups(t *testing.T) { - _, err := cgroups.V1() - if err != nil { - t.Skipf("cgroups disabled") + mode := cgroups.Mode() + if mode == cgroups.Unavailable { + t.Skipf("cgroups not available") + } +} + +// CgroupsV1 checks that cgroups v1 is enabled, if not the +// current test is skipped with a message. +func CgroupsV1(t *testing.T) { + mode := cgroups.Mode() + if mode != cgroups.Legacy && mode != cgroups.Hybrid { + t.Skipf("cgroups v1 not available") + } +} + +// CgroupsV2 checks that cgroups v2 is enabled, if not the +// current test is skipped with a message. +func CgroupsV2(t *testing.T) { + mode := cgroups.Mode() + if mode != cgroups.Unified { + t.Skipf("cgroups v2 unified mode not available") } } @@ -135,6 +153,10 @@ func Cgroups(t *testing.T) { // available, if not the current test is skipped with a // message func CgroupsFreezer(t *testing.T) { + if cgroups.Mode() == cgroups.Unified { + return + } + subSys, err := cgroups.V1() if err != nil { t.Skipf("cgroups disabled") diff --git a/internal/pkg/util/fs/mount/mount_linux.go b/internal/pkg/util/fs/mount/mount_linux.go index 89af03caa5..8bb60d8a80 100644 --- a/internal/pkg/util/fs/mount/mount_linux.go +++ b/internal/pkg/util/fs/mount/mount_linux.go @@ -1,4 +1,4 @@ -// Copyright (c) 2018-2019, Sylabs Inc. All rights reserved. +// Copyright (c) 2018-2021, Sylabs Inc. All rights reserved. // This software is licensed under a 3-clause BSD license. Please consult the // LICENSE.md file distributed with the sources of this project regarding your // rights to use or distribute this software. @@ -156,6 +156,7 @@ var authorizedFS = map[string]fsContext{ "proc": {false}, "mqueue": {false}, "cgroup": {false}, + "cgroup2": {false}, "fuse": {false}, }