-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
*: actually support joining a userns with a new container
(This is a cherry-pick of 1912d59.) Our handling for namespace paths with user namespaces has been broken for a long time. In particular, the need to parse /proc/self/*id_map in quite a few places meant that we would treat userns configurations that had a namespace path as if they were a userns configuration without mappings, resulting in errors. The primary issue was down to the id translation helper functions, which could only handle configurations that had explicit mappings. Obviously, when joining a user namespace we need to map the ids but figuring out the correct mapping is non-trivial in comparison. In order to get the mapping, you need to read /proc/<pid>/*id_map of a process inside the userns -- while most userns paths will be of the form /proc/<pid>/ns/user (and we have a fast-path for this case), this is not guaranteed and thus it is necessary to spawn a process inside the container and read its /proc/<pid>/*id_map files in the general case. As Go does not allow us to spawn a subprocess into a target userns, we have to use CGo to fork a sub-process which does the setns(2). To be honest, this is a little dodgy in regards to POSIX signal-safety(7) but since we do no allocations and we are executing in the forked context from a Go program (not a C program), it should be okay. The other alternative would be to do an expensive re-exec (a-la nsexec which would make several other bits of runc more complicated), or to use nsenter(1) which might not exist on the system and is less than ideal. Because we need to logically remap users quite a few times in runc (including in "runc init", where joining the namespace is not feasible), we cache the mapping inside the libcontainer config struct. A future patch will make sure that we stop allowing invalid user configurations where a mapping is specified as well as a userns path to join. Finally, add an integration test to make sure we don't regress this again. Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
- Loading branch information
Showing
5 changed files
with
360 additions
and
20 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
#define _GNU_SOURCE | ||
#include <fcntl.h> | ||
#include <sched.h> | ||
#include <stdio.h> | ||
#include <unistd.h> | ||
#include <stdarg.h> | ||
#include <stdlib.h> | ||
|
||
/* | ||
* All of the code here is run inside an async-signal-safe context, so we need | ||
* to be careful to not call any functions that could cause issues. In theory, | ||
* since we are a Go program, there are fewer restrictions, but in practice it's | ||
* better to be safe than sorry. | ||
* | ||
* The only exception is exit, which we need to call to make sure we don't | ||
* return into runc. | ||
*/ | ||
|
||
/*
 * Write a printf-style message to pipefd and terminate the process with exit
 * status 1. This function never returns to its caller.
 */
void bail(int pipefd, const char *fmt, ...)
{
	va_list ap;

	/* Format the message directly onto the file descriptor. */
	va_start(ap, fmt);
	vdprintf(pipefd, fmt, ap);
	va_end(ap);

	/* Terminate here so that control never goes back to the caller. */
	exit(1);
}
|
||
/*
 * Fork a child process which joins the user namespace referenced by
 * userns_path and then copies the contents of path to outfd.
 *
 * In the parent this returns the value of fork(): the child's pid on success,
 * or a negative value if the fork failed. The child never returns: it exits
 * with status 0 once the copy is complete, or calls bail() (which writes a
 * message to errfd and exits with status 1) on any failure.
 */
int spawn_userns_cat(char *userns_path, char *path, int outfd, int errfd)
{
	char buffer[4096] = { 0 };

	pid_t child = fork();
	if (child != 0)
		return child;
	/* in child */

	/* Join the target userns. */
	int nsfd = open(userns_path, O_RDONLY);
	if (nsfd < 0)
		bail(errfd, "open userns path %s failed: %m", userns_path);

	int err = setns(nsfd, CLONE_NEWUSER);
	if (err < 0)
		bail(errfd, "setns %s failed: %m", userns_path);

	close(nsfd);

	/* Pipe the requested file contents. */
	int fd = open(path, O_RDONLY);
	if (fd < 0)
		bail(errfd, "open %s in userns %s failed: %m", path, userns_path);

	int nread, ntotal = 0;
	while ((nread = read(fd, buffer, sizeof(buffer))) != 0) {
		if (nread < 0)
			bail(errfd, "read bytes from %s failed (after %d total bytes read): %m", path, ntotal);
		ntotal += nread;

		int nwritten = 0;
		while (nwritten < nread) {
			/*
			 * write(2) may accept fewer bytes than requested, so continue
			 * from the first byte that has not been written yet rather
			 * than from the start of the buffer.
			 */
			int n = write(outfd, buffer + nwritten, nread - nwritten);
			if (n < 0)
				bail(errfd, "write %d bytes from %s failed (after %d bytes written): %m",
				     nread - nwritten, path, nwritten);
			nwritten += n;
		}
		if (nread != nwritten)
			bail(errfd, "mismatch for bytes read and written: %d read != %d written", nread, nwritten);
	}

	close(fd);
	close(outfd);
	close(errfd);

	/* We must exit here, otherwise we would return into a forked runc. */
	exit(0);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,171 @@ | ||
//go:build linux | ||
|
||
package userns | ||
|
||
import ( | ||
"bufio" | ||
"bytes" | ||
"fmt" | ||
"io" | ||
"os" | ||
"unsafe" | ||
|
||
"github.com/opencontainers/runc/libcontainer/configs" | ||
"github.com/sirupsen/logrus" | ||
) | ||
|
||
/* | ||
#include <stdlib.h> | ||
extern int spawn_userns_cat(char *userns_path, char *path, int outfd, int errfd); | ||
*/ | ||
import "C" | ||
|
||
func parseIdmapData(data []byte) (ms []configs.IDMap, err error) { | ||
scanner := bufio.NewScanner(bytes.NewReader(data)) | ||
for scanner.Scan() { | ||
var m configs.IDMap | ||
line := scanner.Text() | ||
if _, err := fmt.Sscanf(line, "%d %d %d", &m.ContainerID, &m.HostID, &m.Size); err != nil { | ||
return nil, fmt.Errorf("parsing id map failed: invalid format in line %q: %w", line, err) | ||
} | ||
ms = append(ms, m) | ||
} | ||
if err := scanner.Err(); err != nil { | ||
return nil, fmt.Errorf("parsing id map failed: %w", err) | ||
} | ||
return ms, nil | ||
} | ||
|
||
// Do something equivalent to nsenter --user=<nsPath> cat <path>, but more | ||
// efficiently. Returns the contents of the requested file from within the user | ||
// namespace. | ||
func spawnUserNamespaceCat(nsPath string, path string) ([]byte, error) { | ||
rdr, wtr, err := os.Pipe() | ||
if err != nil { | ||
return nil, fmt.Errorf("create pipe for userns spawn failed: %w", err) | ||
} | ||
defer rdr.Close() | ||
defer wtr.Close() | ||
|
||
errRdr, errWtr, err := os.Pipe() | ||
if err != nil { | ||
return nil, fmt.Errorf("create error pipe for userns spawn failed: %w", err) | ||
} | ||
defer errRdr.Close() | ||
defer errWtr.Close() | ||
|
||
cNsPath := C.CString(nsPath) | ||
defer C.free(unsafe.Pointer(cNsPath)) | ||
cPath := C.CString(path) | ||
defer C.free(unsafe.Pointer(cPath)) | ||
|
||
childPid := C.spawn_userns_cat(cNsPath, cPath, C.int(wtr.Fd()), C.int(errWtr.Fd())) | ||
|
||
if childPid < 0 { | ||
return nil, fmt.Errorf("failed to spawn fork for userns") | ||
} else if childPid == 0 { | ||
// this should never happen | ||
panic("runc executing inside fork child -- unsafe state!") | ||
} | ||
|
||
// We are in the parent -- close the write end of the pipe before reading. | ||
wtr.Close() | ||
output, err := io.ReadAll(rdr) | ||
rdr.Close() | ||
if err != nil { | ||
return nil, fmt.Errorf("reading from userns spawn failed: %w", err) | ||
} | ||
|
||
// Ditto for the error pipe. | ||
errWtr.Close() | ||
errOutput, err := io.ReadAll(errRdr) | ||
errRdr.Close() | ||
if err != nil { | ||
return nil, fmt.Errorf("reading from userns spawn error pipe failed: %w", err) | ||
} | ||
errOutput = bytes.TrimSpace(errOutput) | ||
|
||
// Clean up the child. | ||
child, err := os.FindProcess(int(childPid)) | ||
if err != nil { | ||
return nil, fmt.Errorf("could not find userns spawn process: %w", err) | ||
} | ||
state, err := child.Wait() | ||
if err != nil { | ||
return nil, fmt.Errorf("failed to wait for userns spawn process: %w", err) | ||
} | ||
if !state.Success() { | ||
errStr := string(errOutput) | ||
if errStr == "" { | ||
errStr = fmt.Sprintf("unknown error (status code %d)", state.ExitCode()) | ||
} | ||
return nil, fmt.Errorf("userns spawn: %s", errStr) | ||
} else if len(errOutput) > 0 { | ||
// We can just ignore weird output in the error pipe if the process | ||
// didn't bail(), but for completeness output for debugging. | ||
logrus.Debugf("userns spawn succeeded but unexpected error message found: %s", string(errOutput)) | ||
} | ||
// The subprocess succeeded, return whatever it wrote to the pipe. | ||
return output, nil | ||
} | ||
|
||
func GetUserNamespaceMappings(nsPath string) (uidMap, gidMap []configs.IDMap, err error) { | ||
var ( | ||
pid int | ||
extra rune | ||
tryFastPath bool | ||
) | ||
|
||
// nsPath is usually of the form /proc/<pid>/ns/user, which means that we | ||
// already have a pid that is part of the user namespace and thus we can | ||
// just use the pid to read from /proc/<pid>/*id_map. | ||
// | ||
// Note that Sscanf doesn't consume the whole input, so we check for any | ||
// trailing data with %c. That way, we can be sure the pattern matched | ||
// /proc/$pid/ns/user _exactly_ iff n === 1. | ||
if n, _ := fmt.Sscanf(nsPath, "/proc/%d/ns/user%c", &pid, &extra); n == 1 { | ||
tryFastPath = pid > 0 | ||
} | ||
|
||
for _, mapType := range []struct { | ||
name string | ||
idMap *[]configs.IDMap | ||
}{ | ||
{"uid_map", &uidMap}, | ||
{"gid_map", &gidMap}, | ||
} { | ||
var mapData []byte | ||
|
||
if tryFastPath { | ||
path := fmt.Sprintf("/proc/%d/%s", pid, mapType.name) | ||
data, err := os.ReadFile(path) | ||
if err != nil { | ||
// Do not error out here -- we need to try the slow path if the | ||
// fast path failed. | ||
logrus.Debugf("failed to use fast path to read %s from userns %s (error: %s), falling back to slow userns-join path", mapType.name, nsPath, err) | ||
} else { | ||
mapData = data | ||
} | ||
} else { | ||
logrus.Debugf("cannot use fast path to read %s from userns %s, falling back to slow userns-join path", mapType.name, nsPath) | ||
} | ||
|
||
if mapData == nil { | ||
// We have to actually join the namespace if we cannot take the | ||
// fast path. The path is resolved with respect to the child | ||
// process, so just use /proc/self. | ||
data, err := spawnUserNamespaceCat(nsPath, "/proc/self/"+mapType.name) | ||
if err != nil { | ||
return nil, nil, err | ||
} | ||
mapData = data | ||
} | ||
idMap, err := parseIdmapData(mapData) | ||
if err != nil { | ||
return nil, nil, fmt.Errorf("failed to parse %s of userns %s: %w", mapType.name, nsPath, err) | ||
} | ||
*mapType.idMap = idMap | ||
} | ||
|
||
return uidMap, gidMap, nil | ||
} |
Oops, something went wrong.