Skip to content

Commit

Permalink
oci: Add --fakeroot support to --oci mode
Browse files Browse the repository at this point in the history
Initial --fakeroot support for --oci mode. Mirrors behavior with
--compat / --contain.

Closes #1035
  • Loading branch information
dtrudg committed Nov 25, 2022
1 parent 06c3521 commit f855964
Show file tree
Hide file tree
Showing 13 changed files with 389 additions and 132 deletions.
8 changes: 8 additions & 0 deletions cmd/internal/cli/oci_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
package cli

import (
"errors"
"os"
"os/exec"

"github.com/spf13/cobra"
"github.com/sylabs/singularity/docs"
"github.com/sylabs/singularity/internal/app/singularity"
Expand Down Expand Up @@ -150,6 +154,10 @@ var OciRunCmd = &cobra.Command{
PreRun: CheckRoot,
Run: func(cmd *cobra.Command, args []string) {
if err := singularity.OciRun(cmd.Context(), args[0], &ociArgs); err != nil {
var exitErr *exec.ExitError
if errors.As(err, &exitErr) {
os.Exit(exitErr.ExitCode())
}
sylog.Fatalf("%s", err)
}
},
Expand Down
6 changes: 3 additions & 3 deletions e2e/actions/oci.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ func (c actionTests) actionOciRun(t *testing.T) {
},
}

for _, profile := range []e2e.Profile{e2e.OCIRootProfile, e2e.OCIUserProfile} {
for _, profile := range e2e.OCIProfiles {
t.Run(profile.String(), func(t *testing.T) {
for _, tt := range tests {
cmdArgs := []string{tt.imageRef}
Expand Down Expand Up @@ -144,7 +144,7 @@ func (c actionTests) actionOciExec(t *testing.T) {
exit: 0,
},
}
for _, profile := range []e2e.Profile{e2e.OCIRootProfile, e2e.OCIUserProfile} {
for _, profile := range e2e.OCIProfiles {
t.Run(profile.String(), func(t *testing.T) {
for _, tt := range tests {
c.env.RunSingularity(
Expand Down Expand Up @@ -198,7 +198,7 @@ func (c actionTests) actionOciShell(t *testing.T) {
},
}

for _, profile := range []e2e.Profile{e2e.OCIRootProfile, e2e.OCIUserProfile} {
for _, profile := range e2e.OCIProfiles {
t.Run(profile.String(), func(t *testing.T) {
for _, tt := range tests {
c.env.RunSingularity(
Expand Down
1 change: 1 addition & 0 deletions internal/pkg/runtime/engine/fakeroot/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,5 @@ type EngineConfig struct {
Envs []string `json:"envs"`
Home string `json:"home"`
BuildEnv bool `json:"buildEnv"`
NoPIDNS bool `json:"NoPIDNS"`
}
6 changes: 5 additions & 1 deletion internal/pkg/runtime/engine/fakeroot/engine_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,11 @@ func (e *EngineOperations) PrepareConfig(starterConfig *starter.Config) error {

g.AddOrReplaceLinuxNamespace(specs.UserNamespace, "")
g.AddOrReplaceLinuxNamespace(specs.MountNamespace, "")
g.AddOrReplaceLinuxNamespace(specs.PIDNamespace, "")

// If we enter a PID NS in the --oci action -> oci run flow, then crun / runc will fail.
if !e.EngineConfig.NoPIDNS {
g.AddOrReplaceLinuxNamespace(specs.PIDNamespace, "")
}

uid := uint32(os.Getuid())
gid := uint32(os.Getgid())
Expand Down
142 changes: 142 additions & 0 deletions internal/pkg/runtime/launcher/oci/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
# internal/pkg/runtime/launcher/oci

This package contains routines that configure and launch a container in an OCI
bundle format, using a low-level OCI runtime, either `crun` or `runc` at this
time. `crun` is currently preferred. `runc` is used where `crun` is not
available.

**Note** - at present, all functionality works with either `crun` or `runc`.
However, in future `crun` may be required for all functionality, as `runc` does
not support some limited ID mappings etc. that may be beneficial in an HPC
scenario.

The package contrasts with `internal/pkg/runtime/launcher/native` which executes
Singularity format containers (SIF/Sandbox/squashfs/ext3), using one of our own
runtime engines (`internal/pkg/runtime/engine/*`).

There are two flows that are implemented here.

* Basic OCI runtime operations against an existing bundle, which will be executed
via the `singularity oci` command group. These are not widely used by
end-users of singularity.
* A `Launcher`, that implements an `Exec` function that will be called by
'actions' (run/shell/exec) in `--oci` mode, and will:
* Prepare an OCI bundle according to `launcher.Options` passed through from
the CLI layer.
* Execute the bundle, interactively, via the OCI Run operation.

**Note** - this area of code is under heavy development for experimental
introduction in CE 3.11. It is likely that it will be heavily refactored, and
split, in future.

## Basic OCI Operations

The following files implement basic OCI operations on a runtime bundle:

### `oci_linux.go`

Defines constants, path resolution, and minimal bundle locking functions.

### `oci_runc_linux.go`

Holds implementations of the Run / Start / Exec / Kill / Delete / Pause / Resume
/ State OCI runtime operations.

See
<https://github.com/opencontainers/runtime-spec/blob/main/runtime.md#operations>

These functions are thin wrappers around the `runc`/`crun` operations of the
same name.

### `oci_conmon_linux.go`

Holds an implementation of the Create OCI runtime operation. This calls out to
`conmon`, which in turn calls `crun` or `runc`.

`conmon` is used to manage logging and console streams for containers that are
started backgrounded, so we don't have to do that ourselves.

### `oci_attach_linux.go`

Implements an `Attach` function, which can attach the user's console to the
streams of a container running in the background, which is being monitored by
conmon.

### Testing

End-to-end flows of basic OCI operations on an existing bundle are tested in the
OCI group of the e2e suite, `e2e/oci`.

## Launcher Flow

The `Launcher` type connects the standard singularity CLI actions
(run/shell/exec), to execution of an OCI container in a native bundle. Invoked
with the `--oci` flag, this is in contrast to running a Singularity format
container, with Singularity's own runtime engine.

### `spec_linux.go`

Provides a minimal OCI runtime spec, that will form the basis of container
execution that is roughly comparable to running a native singularity container
with `--compat` (`--containall`).

### `mounts_linux.go`

Provides code handling the addition of required mounts to the OCI runtime spec.

### `process_linux.go`

Provides code handling configuration of container process execution, including
user mapping.

### `launcher_linux.go`

Implements `Launcher.Exec`, which is called from the CLI layer. It will:

* Create a temporary bundle directory.
* Use `pkg/ocibundle/native` to retrieve the specified image, and extract it in
the temporary bundle.
* Configure the container by creating an appropriate runtime spec.
* Call the interactive OCI Run function to execute the container with `crun` or
`runc`.

### Namespace Considerations

An OCI container started via `Launcher.Exec` as a non-root user always uses at
least one user namespace.

The user namespace is created *prior to* calling `runc` or `crun`, so we'll call
it an *outer* user namespace.

This outer user namespace is created by calling the `RunNS` function instead of
`Run`. The `RunNS` function executes the Singularity `starter` binary, with a
minimal configuration of the fakeroot engine
(`internal/pkg/runtime/engine/fakeroot/config`).

The `starter` will create a user namespace and ID mapping, and will then execute
`singularity oci run` to perform the basic OCI Run operation against the bundle
that the `Launcher.Exec` function has prepared.

The outer user namespace from which `runc` or `crun` is called *always* maps the
host user id to root inside the userns.

When a container is run in `--fakeroot` mode, the outer user namespace is the
only user namespace. The OCI runtime config does not request any additional
userns or ID mapping be performed by `crun` / `runc`.

When a container is **not** run in `--fakeroot` mode, the OCI runtime config for
the bundle requests that `crun` / `runc`:

* Create another, inner, user namespace for the container.
* Apply an ID mapping which reverses the 'fakeroot' outer ID mapping.

I.e., when a container runs without `--fakeroot`, the ID mapping is:

* User ID on host (1001)
* Root in outer user namespace (0)
* User ID in container (1001)

### Testing

End-to-end testing of the launcher flow is via the `e2e/actions` suite. The
relevant tests are prefixed `oci`.
32 changes: 21 additions & 11 deletions internal/pkg/runtime/launcher/oci/launcher_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -203,9 +203,6 @@ func checkOpts(lo launcher.Options) error {
badOpt = append(badOpt, "PwdPath")
}

if lo.Fakeroot {
badOpt = append(badOpt, "Fakeroot")
}
if lo.Boot {
badOpt = append(badOpt, "Boot")
}
Expand Down Expand Up @@ -290,12 +287,18 @@ func (l *Launcher) Exec(ctx context.Context, image string, process string, args
}

spec.Process.User = l.getProcessUser()
uidMap, gidMap, err := l.getIDMaps()
if err != nil {
return err

// If we are *not* requesting fakeroot, then we need to map the container
// uid back to host uid, through the initial fakeroot userns.
if !l.cfg.Fakeroot && os.Getuid() != 0 {
uidMap, gidMap, err := l.getReverseUserMaps()
if err != nil {
return err
}
spec.Linux.UIDMappings = uidMap
spec.Linux.GIDMappings = gidMap
}
spec.Linux.UIDMappings = uidMap
spec.Linux.GIDMappings = gidMap

cwd, err := l.getProcessCwd()
if err != nil {
return err
Expand Down Expand Up @@ -328,9 +331,16 @@ func (l *Launcher) Exec(ctx context.Context, image string, process string, args
return fmt.Errorf("while generating container id: %w", err)
}

err = Run(ctx, id.String(), b.Path(), "")
if exiterr, ok := err.(*exec.ExitError); ok {
os.Exit(exiterr.ExitCode())
if os.Getuid() == 0 {
// Direct execution of runc/crun run.
err = Run(ctx, id.String(), b.Path(), "")
} else {
// Reexec singularity oci run in a userns with mappings.
err = RunNS(ctx, id.String(), b.Path(), "")
}
var exitErr *exec.ExitError
if errors.As(err, &exitErr) {
os.Exit(exitErr.ExitCode())
}
return err
}
84 changes: 48 additions & 36 deletions internal/pkg/runtime/launcher/oci/mounts_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ package oci

import (
"fmt"
"os"
"strconv"

"github.com/opencontainers/runtime-spec/specs-go"
Expand All @@ -33,17 +34,6 @@ func (l *Launcher) getMounts() ([]specs.Mount, error) {
return *mounts, nil
}

// addBindMount adds a bind mount from src on host, to dst in container.
//
// It appends a single specs.Mount entry to the slice that mounts points to, so
// the caller's slice is updated through the pointer. The entry uses Type
// "none" with the "rbind", "nosuid" and "nodev" options. No validation of src
// or dst is performed here.
func (l *Launcher) addBindMount(mounts *[]specs.Mount, src, dst string) {
*mounts = append(*mounts,
specs.Mount{
Source: src,
Destination: dst,
Type: "none",
Options: []string{"rbind", "nosuid", "nodev"},
})
}

// addTmpMounts adds tmpfs mounts for /tmp and /var/tmp in the container.
func (l *Launcher) addTmpMounts(mounts *[]specs.Mount) {
*mounts = append(*mounts,
Expand All @@ -63,9 +53,19 @@ func (l *Launcher) addTmpMounts(mounts *[]specs.Mount) {

// addDevMounts adds mounts to assemble a minimal /dev in the container.
func addDevMounts(mounts *[]specs.Mount) error {
group, err := user.GetGrNam("tty")
if err != nil {
return fmt.Errorf("while identifying tty gid: %w", err)
ptsMount := specs.Mount{
Destination: "/dev/pts",
Type: "devpts",
Source: "devpts",
Options: []string{"nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620"},
}

if os.Getuid() == 0 {
group, err := user.GetGrNam("tty")
if err != nil {
return fmt.Errorf("while identifying tty gid: %w", err)
}
ptsMount.Options = append(ptsMount.Options, fmt.Sprintf("gid=%d", group.GID))
}

*mounts = append(*mounts,
Expand All @@ -75,12 +75,7 @@ func addDevMounts(mounts *[]specs.Mount) error {
Source: "tmpfs",
Options: []string{"nosuid", "strictatime", "mode=755", "size=65536k"},
},
specs.Mount{
Destination: "/dev/pts",
Type: "devpts",
Source: "devpts",
Options: []string{"nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620", "gid=" + strconv.Itoa(int(group.GID))},
},
ptsMount,
specs.Mount{
Destination: "/dev/shm",
Type: "tmpfs",
Expand All @@ -100,33 +95,50 @@ func addDevMounts(mounts *[]specs.Mount) error {

// addProcMount adds the /proc tree in the container.
func (l *Launcher) addProcMount(mounts *[]specs.Mount) {
if l.cfg.Namespaces.PID {
*mounts = append(*mounts,
specs.Mount{
Source: "proc",
Destination: "/proc",
Type: "proc",
})
}

// addSysMount adds the /sys tree in the container.
func (l *Launcher) addSysMount(mounts *[]specs.Mount) {
if os.Getuid() == 0 {
*mounts = append(*mounts,
specs.Mount{
Source: "proc",
Destination: "/proc",
Type: "proc",
Source: "sysfs",
Destination: "/sys",
Type: "sysfs",
Options: []string{"nosuid", "noexec", "nodev", "ro"},
})
} else {
l.addBindMount(mounts, "/proc", "/proc")
*mounts = append(*mounts,
specs.Mount{
Source: "/sys",
Destination: "/sys",
Type: "none",
Options: []string{"rbind", "nosuid", "noexec", "nodev", "ro"},
})
}
}

// addSysMount adds the /sys tree in the container.
func (l *Launcher) addSysMount(mounts *[]specs.Mount) {
*mounts = append(*mounts,
specs.Mount{
Source: "sysfs",
Destination: "/sys",
Type: "sysfs",
Options: []string{"nosuid", "noexec", "nodev", "ro"},
})
}

// addHomeMount adds a user home directory as a tmpfs mount. We are currently
// emulating `--compat` / `--containall`, so the user must specifically bind in
// their home directory from the host for it to be available.
func (l *Launcher) addHomeMount(mounts *[]specs.Mount) error {
if l.cfg.Fakeroot {
*mounts = append(*mounts,
specs.Mount{
Destination: "/root",
Type: "tmpfs",
Source: "tmpfs",
Options: []string{"nosuid", "relatime", "mode=755", "size=65536k"},
})
return nil
}

pw, err := user.CurrentOriginal()
if err != nil {
return err
Expand Down
Loading

0 comments on commit f855964

Please sign in to comment.