diff --git a/CHANGELOG.md b/CHANGELOG.md index f0fe44a01c..963c6b8146 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -87,6 +87,8 @@ For older changes see the [archived Singularity change log](https://github.com/a - `--rocm` to bind ROCm GPU libraries and devices into the container. - `--nv` to bind Nvidia driver / basic CUDA libraries and devices into the container. + - `--apply-cgroups`, and the `--cpu*`, `--blkio*`, `--memory*`, + `--pids-limit` flags to apply resource limits. ### Other changes diff --git a/e2e/cgroups/cgroups.go b/e2e/cgroups/cgroups.go index 2538c3106f..645511159c 100644 --- a/e2e/cgroups/cgroups.go +++ b/e2e/cgroups/cgroups.go @@ -255,9 +255,7 @@ func (c *ctx) instanceStatsRootless(t *testing.T) { c.instanceStats(t, e2e.UserProfile) } -func (c *ctx) actionApply(t *testing.T, profile e2e.Profile) { - e2e.EnsureImage(t, c.env) - +func (c *ctx) actionApply(t *testing.T, profile e2e.Profile, imageRef string) { tests := []struct { name string args []string @@ -265,64 +263,92 @@ func (c *ctx) actionApply(t *testing.T, profile e2e.Profile) { expectErrorOut string rootfull bool rootless bool + skipOCI bool + onlyOCI bool }{ { name: "nonexistent toml", - args: []string{"--apply-cgroups", "testdata/cgroups/doesnotexist.toml", c.env.ImagePath, "/bin/sleep", "5"}, + args: []string{"--apply-cgroups", "testdata/cgroups/doesnotexist.toml", imageRef, "/bin/sleep", "5"}, expectErrorCode: 255, expectErrorOut: "no such file or directory", rootfull: true, rootless: true, + skipOCI: false, + onlyOCI: false, }, { name: "invalid toml", - args: []string{"--apply-cgroups", "testdata/cgroups/invalid.toml", c.env.ImagePath, "/bin/sleep", "5"}, + args: []string{"--apply-cgroups", "testdata/cgroups/invalid.toml", imageRef, "/bin/sleep", "5"}, expectErrorCode: 255, expectErrorOut: "toml: expected character", rootfull: true, rootless: true, + skipOCI: false, + onlyOCI: false, }, { name: "memory limit", - args: []string{"--apply-cgroups", "testdata/cgroups/memory_limit.toml", c.env.ImagePath, "/bin/sleep", "5"}, + args: []string{"--apply-cgroups", "testdata/cgroups/memory_limit.toml", imageRef, "/bin/sleep", "5"}, expectErrorCode: 137, rootfull: true, rootless: true, + skipOCI: true, + onlyOCI: false, + }, + { + name: "memory limit oci", + args: []string{"--apply-cgroups", "testdata/cgroups/memory_limit.toml", imageRef, "/bin/sleep", "5"}, + // crun returns a 1 when the OOM kill happens. + expectErrorCode: 1, + rootfull: true, + rootless: true, + skipOCI: false, + onlyOCI: true, }, { name: "cpu success", - args: []string{"--apply-cgroups", "testdata/cgroups/cpu_success.toml", c.env.ImagePath, "/bin/true"}, + args: []string{"--apply-cgroups", "testdata/cgroups/cpu_success.toml", imageRef, "/bin/true"}, expectErrorCode: 0, rootfull: true, // This currently fails in the e2e scenario due to the way we are using a mount namespace. // It *does* work if you test it, directly calling the apptainer CLI. // Reason is believed to be: https://github.com/opencontainers/runc/issues/3026 rootless: false, + skipOCI: false, + onlyOCI: false, }, // Device access is allowed by default. { name: "device allow default", - args: []string{"--apply-cgroups", "testdata/cgroups/null.toml", c.env.ImagePath, "cat", "/dev/null"}, + args: []string{"--apply-cgroups", "testdata/cgroups/null.toml", imageRef, "cat", "/dev/null"}, expectErrorCode: 0, rootfull: true, rootless: true, + skipOCI: false, + onlyOCI: false, }, // Device limits are properly applied only in rootful mode. Rootless will ignore them with a warning. { name: "device deny", - args: []string{"--apply-cgroups", "testdata/cgroups/deny_device.toml", c.env.ImagePath, "cat", "/dev/null"}, + args: []string{"--apply-cgroups", "testdata/cgroups/deny_device.toml", imageRef, "cat", "/dev/null"}, expectErrorCode: 1, expectErrorOut: "Operation not permitted", rootfull: true, rootless: false, + // runc/crun always allow /dev/null access + skipOCI: true, + onlyOCI: false, }, { name: "device ignored", - args: []string{"--apply-cgroups", "testdata/cgroups/deny_device.toml", c.env.ImagePath, "cat", "/dev/null"}, + args: []string{"--apply-cgroups", "testdata/cgroups/deny_device.toml", imageRef, "cat", "/dev/null"}, expectErrorCode: 0, expectErrorOut: "Device limits will not be applied with rootless cgroups", rootfull: false, rootless: true, + // runc/crun silently ignore in rootless + skipOCI: true, + onlyOCI: false, }, } @@ -334,6 +360,13 @@ func (c *ctx) actionApply(t *testing.T, profile e2e.Profile) { if !profile.Privileged() && !tt.rootless { t.Skip() } + if profile.OCI() && tt.skipOCI { + t.Skip() + } + if !profile.OCI() && tt.onlyOCI { + t.Skip() + } + exitFunc := []e2e.ApptainerCmdResultOp{} if tt.expectErrorOut != "" { exitFunc = []e2e.ApptainerCmdResultOp{e2e.ExpectError(e2e.ContainMatch, tt.expectErrorOut)} @@ -350,13 +383,27 @@ func (c *ctx) actionApply(t *testing.T, profile e2e.Profile) { } func (c *ctx) actionApplyRoot(t *testing.T) { - c.actionApply(t, e2e.RootProfile) + e2e.EnsureImage(t, c.env) + e2e.EnsureOCIImage(t, c.env) + t.Run(e2e.RootProfile.String(), func(t *testing.T) { + c.actionApply(t, e2e.RootProfile, c.env.ImagePath) + }) + t.Run(e2e.OCIRootProfile.String(), func(t *testing.T) { + c.actionApply(t, e2e.OCIRootProfile, "oci-archive:"+c.env.OCIImagePath) + }) } func (c *ctx) actionApplyRootless(t *testing.T) { + e2e.EnsureImage(t, c.env) + e2e.EnsureOCIImage(t, c.env) for _, profile := range []e2e.Profile{e2e.UserProfile, e2e.UserNamespaceProfile, e2e.FakerootProfile} { t.Run(profile.String(), func(t *testing.T) { - c.actionApply(t, profile) + c.actionApply(t, profile, c.env.ImagePath) + }) + } + for _, profile := range []e2e.Profile{e2e.OCIUserProfile, e2e.OCIFakerootProfile} { + t.Run(profile.String(), func(t *testing.T) { + c.actionApply(t, profile, "oci-archive:"+c.env.OCIImagePath) }) } } @@ -499,21 +546,21 @@ var resourceFlagTests = []resourceFlagTest{ }, } -func (c *ctx) actionFlags(t *testing.T, profile e2e.Profile) { +func (c *ctx) actionFlags(t *testing.T, profile e2e.Profile, imageRef string) { e2e.EnsureImage(t, c.env) for _, tt := range resourceFlagTests { t.Run(tt.name, func(t *testing.T) { if cgroups.IsCgroup2UnifiedMode() { - c.actionFlagV2(t, tt, profile) + c.actionFlagV2(t, tt, profile, imageRef) return } - c.actionFlagV1(t, tt, profile) + c.actionFlagV1(t, tt, profile, imageRef) }) } } -func (c *ctx) actionFlagV1(t *testing.T, tt resourceFlagTest, profile e2e.Profile) { +func (c *ctx) actionFlagV1(t *testing.T, tt resourceFlagTest, profile e2e.Profile, imageRef string) { // Don't try to test a resource that doesn't exist in our caller cgroup. // E.g. some systems don't have memory.memswp, and might not have blkio.bfq require.CgroupsResourceExists(t, tt.controllerV1, tt.resourceV1) @@ -530,7 +577,7 @@ func (c *ctx) actionFlagV1(t *testing.T, tt resourceFlagTest, profile e2e.Profil } args := tt.args - args = append(args, "-B", "/sys/fs/cgroup", c.env.ImagePath, "/bin/sh", "-c", shellCmd) + args = append(args, "-B", "/sys/fs/cgroup", imageRef, "/bin/sh", "-c", shellCmd) c.env.RunApptainer( t, @@ -541,7 +588,7 @@ func (c *ctx) actionFlagV1(t *testing.T, tt resourceFlagTest, profile e2e.Profil ) } -func (c *ctx) actionFlagV2(t *testing.T, tt resourceFlagTest, profile e2e.Profile) { +func (c *ctx) actionFlagV2(t *testing.T, tt resourceFlagTest, profile e2e.Profile, imageRef string) { if tt.skipV2 { t.Skip() } @@ -566,7 +613,7 @@ func (c *ctx) actionFlagV2(t *testing.T, tt resourceFlagTest, profile e2e.Profil shellCmd := fmt.Sprintf("cat /sys/fs/cgroup$(cat /proc/self/cgroup | grep '^0::' | cut -d ':' -f 3)/%s", tt.resourceV2) args := tt.args - args = append(args, "-B", "/sys/fs/cgroup", c.env.ImagePath, "/bin/sh", "-c", shellCmd) + args = append(args, "-B", "/sys/fs/cgroup", imageRef, "/bin/sh", "-c", shellCmd) c.env.RunApptainer( t, @@ -578,13 +625,27 @@ func (c *ctx) actionFlagV2(t *testing.T, tt resourceFlagTest, profile e2e.Profil } func (c *ctx) actionFlagsRoot(t *testing.T) { - c.actionFlags(t, e2e.RootProfile) + e2e.EnsureImage(t, c.env) + e2e.EnsureOCIImage(t, c.env) + t.Run(e2e.RootProfile.String(), func(t *testing.T) { + c.actionFlags(t, e2e.RootProfile, c.env.ImagePath) + }) + t.Run(e2e.OCIRootProfile.String(), func(t *testing.T) { + c.actionFlags(t, e2e.OCIRootProfile, "oci-archive:"+c.env.OCIImagePath) + }) } func (c *ctx) actionFlagsRootless(t *testing.T) { + e2e.EnsureImage(t, c.env) + e2e.EnsureOCIImage(t, c.env) for _, profile := range []e2e.Profile{e2e.UserProfile, e2e.UserNamespaceProfile, e2e.FakerootProfile} { t.Run(profile.String(), func(t *testing.T) { - c.actionFlags(t, profile) + c.actionFlags(t, profile, c.env.ImagePath) + }) + } + for _, profile := range []e2e.Profile{e2e.OCIUserProfile, e2e.OCIFakerootProfile} { + t.Run(profile.String(), func(t *testing.T) { + c.actionFlags(t, profile, "oci-archive:"+c.env.OCIImagePath) }) } } diff --git a/e2e/env/oci.go b/e2e/env/oci.go index 444d8128fb..7632b52a85 100644 --- a/e2e/env/oci.go +++ b/e2e/env/oci.go @@ -76,6 +76,7 @@ func (c ctx) ociApptainerEnv(t *testing.T) { e2e.WithProfile(e2e.OCIUserProfile), e2e.WithCommand("exec"), e2e.WithEnv(tt.env), + e2e.WithRootlessEnv(), e2e.WithArgs(tt.image, "/bin/sh", "-c", "echo $PATH"), e2e.ExpectExit( 0, @@ -185,6 +186,7 @@ func (c ctx) ociEnvOption(t *testing.T) { e2e.WithProfile(e2e.OCIUserProfile), e2e.WithCommand("exec"), e2e.WithEnv(tt.hostEnv), + e2e.WithRootlessEnv(), e2e.WithArgs(args...), e2e.ExpectExit( 0, @@ -287,6 +289,7 @@ func (c ctx) ociEnvFile(t *testing.T) { e2e.WithProfile(e2e.OCIUserProfile), e2e.WithCommand("exec"), e2e.WithEnv(tt.hostEnv), + e2e.WithRootlessEnv(), e2e.WithArgs(args...), e2e.ExpectExit( 0, diff --git a/e2e/internal/e2e/apptainercmd.go b/e2e/internal/e2e/apptainercmd.go index 47af2c1dfa..e08bd729c3 100644 --- a/e2e/internal/e2e/apptainercmd.go +++ b/e2e/internal/e2e/apptainercmd.go @@ -535,6 +535,20 @@ func (env TestEnv) RunApptainer(t *testing.T, cmdOps ...ApptainerCmdOp) { cmd.Env = os.Environ() } + // Clear user-specific DBUS / XDG vars when we are using a priv profile, + // as they don't make sense for the root user... and wouldn't be set in a + // real root user session. + if privileged { + i := 0 + for _, e := range cmd.Env { + if !(strings.HasPrefix(e, "DBUS_SESSION_BUS_ADDRESS=") || strings.HasPrefix(e, "XDG_RUNTIME_DIR=")) { + cmd.Env[i] = e + i++ + } + } + cmd.Env = cmd.Env[:i] + } + // By default, each E2E command shares a temporary image cache // directory. If a test is directly testing the cache, or depends on // specific ordered cache behavior then diff --git a/e2e/internal/e2e/profile.go b/e2e/internal/e2e/profile.go index 61d45fad76..930c86e782 100644 --- a/e2e/internal/e2e/profile.go +++ b/e2e/internal/e2e/profile.go @@ -68,6 +68,7 @@ type Profile struct { requirementsFn func(*testing.T) // function checking requirements for the profile apptainerOption string // option added to apptainer command for the profile optionForCommands []string // apptainer commands concerned by the option to be added + oci bool // whether the profile uses the OCI low-level runtime } // NativeProfiles defines all available profiles for the native apptainer runtime @@ -81,6 +82,7 @@ var NativeProfiles = map[string]Profile{ requirementsFn: nil, apptainerOption: "", optionForCommands: []string{}, + oci: false, }, rootProfile: { name: "Root", @@ -91,6 +93,7 @@ var NativeProfiles = map[string]Profile{ requirementsFn: nil, apptainerOption: "", optionForCommands: []string{}, + oci: false, }, fakerootProfile: { name: "Fakeroot", @@ -101,6 +104,7 @@ var NativeProfiles = map[string]Profile{ requirementsFn: fakerootRequirements, apptainerOption: "--fakeroot", optionForCommands: []string{"shell", "exec", "run", "test", "instance start", "build"}, + oci: false, }, userNamespaceProfile: { name: "UserNamespace", @@ -111,6 +115,7 @@ var NativeProfiles = map[string]Profile{ requirementsFn: require.UserNamespace, apptainerOption: "--userns", optionForCommands: []string{"shell", "exec", "run", "test", "instance start"}, + oci: false, }, rootUserNamespaceProfile: { name: "RootUserNamespace", @@ -121,6 +126,7 @@ var NativeProfiles = map[string]Profile{ requirementsFn: require.UserNamespace, apptainerOption: "--userns", optionForCommands: []string{"shell", "exec", "run", "test", "instance start"}, + oci: false, }, } @@ -135,6 +141,7 @@ var OCIProfiles = map[string]Profile{ requirementsFn: ociRequirements, apptainerOption: "--oci", optionForCommands: []string{"shell", "exec", "run", "test", "instance start"}, + oci: true, }, ociRootProfile: { name: "OCIRoot", @@ -145,6 +152,7 @@ var OCIProfiles = map[string]Profile{ requirementsFn: ociRequirements, apptainerOption: "--oci", optionForCommands: []string{"shell", "exec", "run", "test", "instance start"}, + oci: true, }, ociFakerootProfile: { name: "OCIFakeroot", @@ -155,6 +163,7 @@ var OCIProfiles = map[string]Profile{ requirementsFn: ociRequirements, apptainerOption: "--oci --fakeroot", optionForCommands: []string{"shell", "exec", "run", "test", "instance start"}, + oci: true, }, } @@ -176,6 +185,11 @@ func (p Profile) Privileged() bool { return p.privileged } +// OCI returns whether the profile is using an OCI runtime, rather than the apptainer native runtime. +func (p Profile) OCI() bool { + return p.oci +} + // Requirements calls the different require.* functions // necessary for running an E2E test under this profile. func (p Profile) Requirements(t *testing.T) { diff --git a/internal/app/apptainer/oci_linux.go b/internal/app/apptainer/oci_linux.go index 2e1732a202..d102aee784 100644 --- a/internal/app/apptainer/oci_linux.go +++ b/internal/app/apptainer/oci_linux.go @@ -14,9 +14,12 @@ package apptainer import ( "context" + "fmt" + "github.com/apptainer/apptainer/internal/pkg/buildcfg" "github.com/apptainer/apptainer/internal/pkg/runtime/launcher/oci" ocibundle "github.com/apptainer/apptainer/pkg/ocibundle/sif" + "github.com/apptainer/apptainer/pkg/util/apptainerconf" ) // OciArgs contains CLI arguments @@ -34,52 +37,92 @@ type OciArgs struct { // OciRun runs a container (equivalent to create/start/delete) func OciRun(ctx context.Context, containerID string, args *OciArgs) error { - return oci.Run(ctx, containerID, args.BundlePath, args.PidFile) + systemdCgroups, err := systemdCgroups() + if err != nil { + return err + } + return oci.Run(ctx, containerID, args.BundlePath, args.PidFile, systemdCgroups) } // OciCreate creates a container from an OCI bundle func OciCreate(containerID string, args *OciArgs) error { - return oci.Create(containerID, args.BundlePath) + systemdCgroups, err := systemdCgroups() + if err != nil { + return err + } + return oci.Create(containerID, args.BundlePath, systemdCgroups) } // OciStart starts a previously create container func OciStart(containerID string) error { - return oci.Start(containerID) + systemdCgroups, err := systemdCgroups() + if err != nil { + return err + } + return oci.Start(containerID, systemdCgroups) } // OciDelete deletes container resources func OciDelete(ctx context.Context, containerID string) error { - return oci.Delete(ctx, containerID) + systemdCgroups, err := systemdCgroups() + if err != nil { + return err + } + return oci.Delete(ctx, containerID, systemdCgroups) } // OciExec executes a command in a container func OciExec(containerID string, cmdArgs []string) error { - return oci.Exec(containerID, cmdArgs) + systemdCgroups, err := systemdCgroups() + if err != nil { + return err + } + return oci.Exec(containerID, cmdArgs, systemdCgroups) } // OciKill kills container process func OciKill(containerID string, killSignal string) error { - return oci.Kill(containerID, killSignal) + systemdCgroups, err := systemdCgroups() + if err != nil { + return err + } + return oci.Kill(containerID, killSignal, systemdCgroups) } // OciPause pauses processes in a container func OciPause(containerID string) error { - return oci.Pause(containerID) + systemdCgroups, err := systemdCgroups() + if err != nil { + return err + } + return oci.Pause(containerID, systemdCgroups) } // OciResume pauses processes in a container func OciResume(containerID string) error { - return oci.Resume(containerID) + systemdCgroups, err := systemdCgroups() + if err != nil { + return err + } + return oci.Resume(containerID, systemdCgroups) } // OciState queries container state func OciState(containerID string, args *OciArgs) error { - return oci.State(containerID) + systemdCgroups, err := systemdCgroups() + if err != nil { + return err + } + return oci.State(containerID, systemdCgroups) } // OciUpdate updates container cgroups resources func OciUpdate(containerID string, args *OciArgs) error { - return oci.Update(containerID, args.FromFile) + systemdCgroups, err := systemdCgroups() + if err != nil { + return err + } + return oci.Update(containerID, args.FromFile, systemdCgroups) } // OciMount mount a SIF image to create an OCI bundle @@ -99,3 +142,14 @@ func OciUmount(bundle string) error { } return d.Delete() } + +func systemdCgroups() (use bool, err error) { + cfg := apptainerconf.GetCurrentConfig() + if cfg == nil { + cfg, err = apptainerconf.Parse(buildcfg.APPTAINER_CONF_FILE) + if err != nil { + return false, fmt.Errorf("unable to parse apptainer configuration file: %w", err) + } + } + return cfg.SystemdCgroups, nil +} diff --git a/internal/pkg/cgroups/manager_linux.go b/internal/pkg/cgroups/manager_linux.go index e630b93b6e..8dc9717c9e 100644 --- a/internal/pkg/cgroups/manager_linux.go +++ b/internal/pkg/cgroups/manager_linux.go @@ -14,7 +14,6 @@ import ( "fmt" "os" "path/filepath" - "strconv" "strings" "github.com/apptainer/apptainer/internal/pkg/util/env" @@ -344,15 +343,8 @@ func NewManagerWithSpec(spec *specs.LinuxResources, pid int, group string, syste if pid == 0 { return nil, fmt.Errorf("a pid is required to create a new cgroup") } - if group == "" && !systemd { - group = filepath.Join("/apptainer", strconv.Itoa(pid)) - } - if group == "" && systemd { - if os.Getuid() == 0 { - group = "system.slice:apptainer:" + strconv.Itoa(pid) - } else { - group = "user.slice:apptainer:" + strconv.Itoa(pid) - } + if group == "" { + group = DefaultPathForPid(systemd, pid) } sylog.Debugf("Creating cgroups manager for %s", group) diff --git a/internal/pkg/cgroups/util.go b/internal/pkg/cgroups/util.go index 2c27fdcf82..96ddc7a340 100644 --- a/internal/pkg/cgroups/util.go +++ b/internal/pkg/cgroups/util.go @@ -11,6 +11,9 @@ package cgroups import ( "fmt" + "os" + "path/filepath" + "strconv" lccgroups "github.com/opencontainers/runc/libcontainer/cgroups" ) @@ -51,3 +54,24 @@ func pidToPath(pid int) (path string, err error) { } return path, nil } + +// DefaultPathForPid returns a default cgroup path for a given PID. +func DefaultPathForPid(systemd bool, pid int) (group string) { + // Default naming is pid of first process added to cgroup. + strPid := strconv.Itoa(pid) + // Request is for an empty cgroup... name it for the requestor's (our) pid. + if pid == -1 { + strPid = "parent-" + strconv.Itoa(os.Getpid()) + } + + if systemd { + if os.Getuid() == 0 { + group = "system.slice:apptainer:" + strPid + } else { + group = "user.slice:apptainer:" + strPid + } + } else { + group = filepath.Join("/apptainer", strPid) + } + return group +} diff --git a/internal/pkg/runtime/launcher/oci/launcher_linux.go b/internal/pkg/runtime/launcher/oci/launcher_linux.go index 74413b6669..e661d22bfa 100644 --- a/internal/pkg/runtime/launcher/oci/launcher_linux.go +++ b/internal/pkg/runtime/launcher/oci/launcher_linux.go @@ -24,6 +24,7 @@ import ( "github.com/apptainer/apptainer/internal/pkg/buildcfg" "github.com/apptainer/apptainer/internal/pkg/cache" + "github.com/apptainer/apptainer/internal/pkg/cgroups" "github.com/apptainer/apptainer/internal/pkg/runtime/launcher" "github.com/apptainer/apptainer/internal/pkg/util/fs/files" "github.com/apptainer/apptainer/internal/pkg/util/user" @@ -162,10 +163,6 @@ func checkOpts(lo launcher.Options) error { badOpt = append(badOpt, "NoUmask") } - if lo.CGroupsJSON != "" { - badOpt = append(badOpt, "CGroupsJSON") - } - // ConfigFile always set by CLI. We should support only the default from build time. if lo.ConfigFile != "" && lo.ConfigFile != buildcfg.APPTAINER_CONF_FILE { badOpt = append(badOpt, "ConfigFile") @@ -225,6 +222,15 @@ func (l *Launcher) createSpec() (*specs.Spec, error) { } spec.Mounts = mounts + cgPath, resources, err := l.getCgroup() + if err != nil { + return nil, err + } + if cgPath != "" { + spec.Linux.CgroupsPath = cgPath + spec.Linux.Resources = resources + } + return &spec, nil } @@ -404,9 +410,10 @@ func (l *Launcher) Exec(ctx context.Context, image string, process string, args if os.Getuid() == 0 { // Direct execution of runc/crun run. - err = Run(ctx, id.String(), b.Path(), "") + err = Run(ctx, id.String(), b.Path(), "", l.apptainerConf.SystemdCgroups) } else { // Reexec apptainer oci run in a userns with mappings. + // Note - the oci run command will pull out the SystemdCgroups setting from config. err = RunNS(ctx, id.String(), b.Path(), "") } var exitErr *exec.ExitError @@ -416,6 +423,19 @@ func (l *Launcher) Exec(ctx context.Context, image string, process string, args return err } +// getCgroup will return a cgroup path and resources for the runtime to create. +func (l *Launcher) getCgroup() (path string, resources *specs.LinuxResources, err error) { + if l.cfg.CGroupsJSON == "" { + return "", nil, nil + } + path = cgroups.DefaultPathForPid(l.apptainerConf.SystemdCgroups, -1) + resources, err = cgroups.UnmarshalJSONResources(l.cfg.CGroupsJSON) + if err != nil { + return "", nil, err + } + return path, resources, nil +} + func mergeMap(a map[string]string, b map[string]string) map[string]string { for k, v := range b { a[k] = v diff --git a/internal/pkg/runtime/launcher/oci/oci_conmon_linux.go b/internal/pkg/runtime/launcher/oci/oci_conmon_linux.go index ba69214991..2b3a9c261c 100644 --- a/internal/pkg/runtime/launcher/oci/oci_conmon_linux.go +++ b/internal/pkg/runtime/launcher/oci/oci_conmon_linux.go @@ -38,7 +38,7 @@ type ociError struct { } // Create creates a container from an OCI bundle -func Create(containerID, bundlePath string) error { +func Create(containerID, bundlePath string, systemdCgroups bool) error { conmon, err := bin.FindBin("conmon") if err != nil { return err @@ -120,6 +120,10 @@ func Create(containerID, bundlePath string) error { "--exit-command-arg", containerID, } + if systemdCgroups { + cmdArgs = append(cmdArgs, "--systemd-cgroup") + } + cmd := exec.Command(conmon, cmdArgs...) cmd.Dir = absBundle cmd.Env = append(cmd.Env, fmt.Sprintf("_OCI_SYNCPIPE=%d", 3), fmt.Sprintf("_OCI_STARTPIPE=%d", 4)) @@ -157,7 +161,7 @@ func Create(containerID, bundlePath string) error { // We check for errors from runc (which conmon invokes) via the sync pipe pid, err := readConmonPipeData(syncParent, path.Join(sd, runcLogFile)) if err != nil { - if err2 := Delete(context.TODO(), containerID); err2 != nil { + if err2 := Delete(context.TODO(), containerID, systemdCgroups); err2 != nil { sylog.Errorf("Removing container %s from runtime after creation failed", containerID) } return err diff --git a/internal/pkg/runtime/launcher/oci/oci_runc_linux.go b/internal/pkg/runtime/launcher/oci/oci_runc_linux.go index cc3a5d693b..fa919dcc34 100644 --- a/internal/pkg/runtime/launcher/oci/oci_runc_linux.go +++ b/internal/pkg/runtime/launcher/oci/oci_runc_linux.go @@ -28,7 +28,7 @@ import ( ) // Delete deletes container resources -func Delete(ctx context.Context, containerID string) error { +func Delete(ctx context.Context, containerID string, systemdCgroups bool) error { runtimeBin, err := runtime() if err != nil { return err @@ -40,9 +40,11 @@ func Delete(ctx context.Context, containerID string) error { runtimeArgs := []string{ "--root", rsd, - "delete", - containerID, } + if systemdCgroups { + runtimeArgs = append(runtimeArgs, "--systemd-cgroup") + } + runtimeArgs = append(runtimeArgs, "delete", containerID) cmd := exec.Command(runtimeBin, runtimeArgs...) cmd.Stdout = os.Stdout @@ -75,7 +77,7 @@ func Delete(ctx context.Context, containerID string) error { } // Exec executes a command in a container -func Exec(containerID string, cmdArgs []string) error { +func Exec(containerID string, cmdArgs []string, systemdCgroups bool) error { runtimeBin, err := runtime() if err != nil { return err @@ -87,9 +89,11 @@ func Exec(containerID string, cmdArgs []string) error { runtimeArgs := []string{ "--root", rsd, - "exec", - containerID, } + if systemdCgroups { + runtimeArgs = append(runtimeArgs, "--systemd-cgroup") + } + runtimeArgs = append(runtimeArgs, "exec", containerID) runtimeArgs = append(runtimeArgs, cmdArgs...) cmd := exec.Command(runtimeBin, runtimeArgs...) cmd.Stdout = os.Stdout @@ -100,7 +104,7 @@ func Exec(containerID string, cmdArgs []string) error { } // Kill kills container process -func Kill(containerID string, killSignal string) error { +func Kill(containerID string, killSignal string, systemdCgroups bool) error { runtimeBin, err := runtime() if err != nil { return err @@ -126,7 +130,7 @@ func Kill(containerID string, killSignal string) error { } // Pause pauses processes in a container -func Pause(containerID string) error { +func Pause(containerID string, systemdCgroups bool) error { runtimeBin, err := runtime() if err != nil { return err @@ -138,9 +142,11 @@ func Pause(containerID string) error { runtimeArgs := []string{ "--root", rsd, - "pause", - containerID, } + if systemdCgroups { + runtimeArgs = append(runtimeArgs, "--systemd-cgroup") + } + runtimeArgs = append(runtimeArgs, "pause", containerID) cmd := exec.Command(runtimeBin, runtimeArgs...) cmd.Stdout = os.Stdout @@ -151,7 +157,7 @@ func Pause(containerID string) error { } // Resume pauses processes in a container -func Resume(containerID string) error { +func Resume(containerID string, systemdCgroups bool) error { runtimeBin, err := runtime() if err != nil { return err @@ -163,9 +169,11 @@ func Resume(containerID string) error { runtimeArgs := []string{ "--root", rsd, - "resume", - containerID, } + if systemdCgroups { + runtimeArgs = append(runtimeArgs, "--systemd-cgroup") + } + runtimeArgs = append(runtimeArgs, "resume", containerID) cmd := exec.Command(runtimeBin, runtimeArgs...) cmd.Stdout = os.Stdout @@ -176,7 +184,7 @@ func Resume(containerID string) error { } // Run runs a container (equivalent to create/start/delete) -func Run(ctx context.Context, containerID, bundlePath, pidFile string) error { +func Run(ctx context.Context, containerID, bundlePath, pidFile string, systemdCgroups bool) error { runtimeBin, err := runtime() if err != nil { return err @@ -197,12 +205,15 @@ func Run(ctx context.Context, containerID, bundlePath, pidFile string) error { runtimeArgs := []string{ "--root", rsd, - "run", - "-b", absBundle, } + if systemdCgroups { + runtimeArgs = append(runtimeArgs, "--systemd-cgroup") + } + runtimeArgs = append(runtimeArgs, "run", "-b", absBundle) if pidFile != "" { runtimeArgs = append(runtimeArgs, "--pid-file="+pidFile) } + runtimeArgs = append(runtimeArgs, containerID) cmd := exec.Command(runtimeBin, runtimeArgs...) cmd.Stdout = os.Stdout @@ -237,9 +248,13 @@ func RunNS(ctx context.Context, containerID, bundlePath, pidFile string) error { sylog.Debugf("Calling fakeroot engine to execute %q", strings.Join(args, " ")) cfg := &config.Common{ - EngineName: fakerootConfig.Name, - ContainerID: "fakeroot", - EngineConfig: &fakerootConfig.EngineConfig{Args: args, NoPIDNS: true}, + EngineName: fakerootConfig.Name, + ContainerID: "fakeroot", + EngineConfig: &fakerootConfig.EngineConfig{ + Envs: os.Environ(), + Args: args, + NoPIDNS: true, + }, } return starter.Run( @@ -252,7 +267,7 @@ func RunNS(ctx context.Context, containerID, bundlePath, pidFile string) error { } // Start starts a previously created container -func Start(containerID string) error { +func Start(containerID string, systemdCgroups bool) error { runtimeBin, err := runtime() if err != nil { return err @@ -264,9 +279,11 @@ func Start(containerID string) error { runtimeArgs := []string{ "--root", rsd, - "start", - containerID, } + if systemdCgroups { + runtimeArgs = append(runtimeArgs, "--systemd-cgroup") + } + runtimeArgs = append(runtimeArgs, "start", containerID) cmd := exec.Command(runtimeBin, runtimeArgs...) cmd.Stdout = os.Stdout @@ -277,7 +294,7 @@ func Start(containerID string) error { } // State queries container state -func State(containerID string) error { +func State(containerID string, systemdCgroups bool) error { runtimeBin, err := runtime() if err != nil { return err @@ -289,9 +306,11 @@ func State(containerID string) error { runtimeArgs := []string{ "--root", rsd, - "state", - containerID, } + if systemdCgroups { + runtimeArgs = append(runtimeArgs, "--systemd-cgroup") + } + runtimeArgs = append(runtimeArgs, "state", containerID) cmd := exec.Command(runtimeBin, runtimeArgs...) cmd.Stdout = os.Stdout @@ -302,7 +321,7 @@ func State(containerID string) error { } // Update updates container cgroups resources -func Update(containerID, cgFile string) error { +func Update(containerID, cgFile string, systemdCgroups bool) error { runtimeBin, err := runtime() if err != nil { return err @@ -314,10 +333,11 @@ func Update(containerID, cgFile string) error { runtimeArgs := []string{ "--root", rsd, - "update", - "-r", cgFile, - containerID, } + if systemdCgroups { + runtimeArgs = append(runtimeArgs, "--systemd-cgroup") + } + runtimeArgs = append(runtimeArgs, "update", "-r", cgFile, containerID) cmd := exec.Command(runtimeBin, runtimeArgs...) cmd.Stdout = os.Stdout