Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle file descriptor access control #4

Open
l0kod opened this issue Mar 6, 2015 · 1 comment
Open

Handle file descriptor access control #4

l0kod opened this issue Mar 6, 2015 · 1 comment

Comments

@l0kod
Copy link

l0kod commented Mar 6, 2015

A sandbox would greatly benefit from being able to only use a set of file descriptors/handles instead of accessing an explicit path, with seccomp-bpf (e.g. write(2), fstat(2)…) and maybe later with capsicum (e.g. openat(2)).

This could also allow efficient data sharing (i.e. memfd_create(2)/seal/mmap).

cc rust-lang/rust#21936
cc rust-lang/rfcs#941
cc #2

@mstewartgallus
Copy link

Can't this be done by chrooting or pivot_rooting into an unlinked directory that was created inside a mounted tmpfs? I'm thinking something like this.

#define _GNU_SOURCE

#include <assert.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <grp.h>
#include <mntent.h>
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/capability.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#include <linux/sched.h>

/*
 * Fallback for C libraries whose <sys/syscall.h> predates execveat(2).
 * 322 is the x86-64 number only; other architectures use different
 * numbers, so refuse to build rather than invoke an unrelated syscall.
 */
#ifndef __NR_execveat
#if defined(__x86_64__) && !defined(__ILP32__)
#define __NR_execveat 322
#else
#error "__NR_execveat is not defined by <sys/syscall.h>; add this architecture's syscall number"
#endif
#endif

/* Program executed inside the sandbox (opened before the root is changed). */
#define SHELL "/bin/busybox"

/* Directory, relative to the starting working directory, that receives
 * the tmpfs which becomes the sandbox root.
 */
#define RUNTIME_NAME "sandbox"

/* Hostname set inside the new UTS namespace. */
#define HOSTNAME "sandbox"

/* Number of elements in a true array (not a pointer). The argument is
 * parenthesized so that any array-valued expression may be passed.
 */
#define ARRAY_SIZE(A) (sizeof(A) / sizeof((A)[0U]))

/* argv and envp handed to the sandboxed program. */
static char * const shell_arguments[] = { (char *)SHELL, "sh", NULL };
static char * const shell_environment[] = { "container=init", NULL };

/*
 * fstab(5)-style description of the file systems to mount under the
 * sandbox root. Besides regular mount options, main()'s parser
 * understands "mkdir" and "touch" (create the mount point first) and
 * "bind"/"rbind". The table is only read, hence const.
 */
static const char fstab_config[] =
    "# <file system>    <mount point>   <type>  <options>\n"
    "tmpfs  tmp tmpfs   mkdir,nodev,noexec,nosuid\n"
    "\n"
    "# Allow connecting to X11\n"
    "/tmp/.X11-unix tmp/.X11-unix   none    mkdir,ro,bind,noexec,nosuid\n"
    "\n"
    "tmpfs  dev tmpfs   mkdir,nosuid,noexec\n"
    "\n"
    "# 3D acceleration\n"
    "/dev/dri   dev/dri none    mkdir,ro,bind,noexec,nosuid\n"
    "\n"
    "/dev/null  dev/null    none    touch,bind\n"
    "/dev/full  dev/full    none    touch,bind\n"
    "/dev/zero  dev/zero    none    touch,bind\n"
    "/dev/urandom   dev/urandom none    touch,bind\n"
    "\n"
    "/dev/tty   dev/tty none    touch,bind\n"
    "\n"
    "devpts dev/pts devpts  mkdir,ptmxmode=0666,newinstance\n"
    "\n"
    "tmpfs  run tmpfs   mkdir,nosuid,noexec\n"
    "tmpfs  run/lock    tmpfs   mkdir,nosuid,nodev,noexec\n"
    "tmpfs  run/shm tmpfs   mkdir,nosuid,nodev\n"
    "tmpfs  var tmpfs   mkdir,nosuid,noexec\n"
    "\n"
    "proc   proc    proc    mkdir,ro,nodev,noexec,nosuid\n"
    "sysfs  sys sysfs   mkdir,ro,nodev,noexec,nosuid\n"
    "\n"
    "/lib   lib none    mkdir,ro,nodev,nosuid,bind\n"
    "/lib32 lib32   none    mkdir,ro,nodev,nosuid,bind\n"
    "/lib64 lib64   none    mkdir,ro,nodev,nosuid,bind\n"
    "\n"
    "/bin   bin none    mkdir,ro,nodev,nosuid,bind\n"
    "/sbin  sbin    none    mkdir,ro,nodev,nosuid,bind\n"
    "/usr   usr none    mkdir,ro,nodev,nosuid,bind\n"
    "\n"
    "/etc   etc none    mkdir,ro,nodev,nosuid,bind\n";

/* Forward declaration: main() calls this before doing anything else;
 * the definition follows main().
 */
static int close_leaked_fds(void);

/*
 * Wait for `child` to terminate and return the value the caller should
 * exit with: the child's si_status as reported by waitid(2) (its exit
 * code, or the signal number if it was killed), or EXIT_FAILURE when
 * waitid itself fails. EINTR is retried.
 */
static int wait_for_child(pid_t child)
{
    siginfo_t info;
    int errnum;

    do {
        errnum = -1 == waitid(P_PID, child, &info, WEXITED) ? errno : 0;
    } while (EINTR == errnum);

    if (errnum != 0) {
        /* Never read `info` here: waitid did not fill it in. */
        errno = errnum;
        perror("waitid");
        return EXIT_FAILURE;
    }

    return info.si_status;
}

/*
 * Write `text` to the existing file `path` with exactly one write(2).
 * Used for /proc/self/{setgroups,uid_map,gid_map}. Reports the failing
 * step on stderr and returns -1 on error, 0 on success.
 */
static int write_proc_file(char const *path, char const *text)
{
    int file = open(path, O_CLOEXEC | O_WRONLY);
    if (-1 == file) {
        perror("open");
        return -1;
    }

    size_t const length = strlen(text);
    ssize_t const written = write(file, text, length);
    if (written < 0) {
        perror("write");
        close(file);
        return -1;
    }
    if ((size_t)written != length) {
        fprintf(stderr, "write: short write to %s\n", path);
        close(file);
        return -1;
    }

    if (-1 == close(file)) {
        perror("close");
        return -1;
    }

    return 0;
}

/*
 * Create RUNTIME_NAME if needed, mount a fresh tmpfs on it and make it
 * the working directory. Returns 0 on success, -1 (after perror) on
 * failure.
 */
static int enter_runtime_tmpfs(void)
{
    if (-1 == mkdir(RUNTIME_NAME, S_IRWXU)) {
        if (errno != EEXIST) {
            perror("mkdir");
            return -1;
        }
    }

    if (-1 == mount("tmpfs", RUNTIME_NAME, "tmpfs", 0, NULL)) {
        perror("mount");
        return -1;
    }

    if (-1 == chdir(RUNTIME_NAME)) {
        perror("chdir");
        return -1;
    }

    return 0;
}

/*
 * Make the current working directory the root mount, then lazily
 * detach the previous root and chdir to the new "/". Returns 0 on
 * success, -1 (after perror) on failure.
 */
static int pivot_into_cwd(void)
{
    int old_root = open("/", O_DIRECTORY);
    if (-1 == old_root) {
        perror("open");
        return -1;
    }

    /* new_root == put_old == ".": the old root ends up stacked on the
     * same mount point and is detached right below.
     */
    if (-1 == syscall(__NR_pivot_root, ".", ".")) {
        perror("pivot_root");
        return -1;
    }

    if (-1 == fchdir(old_root)) {
        perror("fchdir");
        return -1;
    }

    if (-1 == umount2(".", MNT_DETACH)) {
        perror("umount");
        return -1;
    }

    if (-1 == close(old_root)) {
        perror("close");
        return -1;
    }

    if (-1 == chdir("/")) {
        perror("chdir");
        return -1;
    }

    return 0;
}

/*
 * Enter the runtime tmpfs and mount every entry of fstab_config below
 * it. Each entry's option list may contain the pseudo options "mkdir" /
 * "touch" (create the mount point first) and "bind" / "rbind"; the
 * first unrecognised option and everything after it is passed to
 * mount(2) as the data string. Returns 0 on success, -1 (after a
 * diagnostic) on failure. Resources are not released on the error
 * paths; the caller exits immediately.
 */
static int populate_root_from_fstab(void)
{
    FILE * tmp = tmpfile();
    if (NULL == tmp) {
        perror("tmpfile");
        return -1;
    }

    {
        size_t const bytes_to_write = sizeof fstab_config - 1U;
        if (fwrite(fstab_config, 1U, bytes_to_write, tmp) != bytes_to_write) {
            perror("fwrite");
            return -1;
        }
    }

    /* setmntent() needs a path, so reopen the anonymous file via /proc. */
    char tmppath[] = "/proc/self/fd/XXXXXXXXXXX";
    snprintf(tmppath, sizeof tmppath, "/proc/self/fd/%i", fileno(tmp));

    FILE * fstab = setmntent(tmppath, "r");
    if (NULL == fstab) {
        perror("setmntent");
        return -1;
    }

    /* Closing also flushes the buffered table before it is first read. */
    if (EOF == fclose(tmp)) {
        perror("fclose");
        return -1;
    }

    if (-1 == enter_runtime_tmpfs()) {
        return -1;
    }

    for (;;) {
        errno = 0;
        struct mntent * entry = getmntent(fstab);
        if (NULL == entry) {
            if (errno != 0) {
                perror("getmntent");
                return -1;
            }

            break;
        }

        enum {
            MKDIR,
            TOUCH,
            BIND,
            RBIND,
            REMOUNT,
            RO,
            RW,
            SUID,
            NOSUID,
            DEV,
            NODEV,
            EXEC,
            NOEXEC,
            USER,
            NOUSER,
            KERNMOUNT,
            ACTIVE
        };
        char * const token[] = {
            [MKDIR] = "mkdir",
            [TOUCH] = "touch",
            [BIND] = "bind",
            [RBIND] = "rbind",
            [REMOUNT] = "remount",
            [RO] = MNTOPT_RO,
            [RW] = MNTOPT_RW,
            [SUID] = MNTOPT_SUID,
            [NOSUID] = MNTOPT_NOSUID,
            [DEV] = "dev",
            [NODEV] = "nodev",
            [EXEC] = "exec",
            [NOEXEC] = "noexec",
            [USER] = "user",
            [NOUSER] = "nouser",
            [KERNMOUNT] = "kernmount",
            [ACTIVE] = "active",
            NULL
        };
        bool mkdir_flag = false;
        bool touch_flag = false;
        bool bind = false;
        bool rec = false;
        bool remount = false;
        bool readonly = false;
        bool readwrite = false;
        bool suid = true;
        bool dev = true;
        bool exec = true;
        bool user = true;
        bool kernmount = false;
        bool active = false;

        /* Points into entry->mnt_opts at the first unrecognised option. */
        char *leftovers = NULL;

        {
            char *mnt_opts = entry->mnt_opts;

            if (0 == strcmp("none", mnt_opts)) {
                goto mount;
            }

            /* getsubopt() modifies its input, so parse a private copy. */
            char *subopts_str = strdup(mnt_opts);
            if (NULL == subopts_str) {
                perror("strdup");
                return -1;
            }

            char * subopts = subopts_str;

            char *value = NULL;
            while (*subopts != '\0') {
                switch (getsubopt(&subopts, token, &value)) {
                case MKDIR:
                    mkdir_flag = true;
                    break;

                case TOUCH:
                    touch_flag = true;
                    break;

                case BIND:
                    bind = true;
                    break;

                case RBIND:
                    bind = true;
                    rec = true;
                    break;

                case REMOUNT:
                    remount = true;
                    break;

                case RO:
                    readonly = true;
                    break;

                case RW:
                    readwrite = true;
                    break;

                case SUID:
                    suid = true;
                    break;

                case NOSUID:
                    suid = false;
                    break;

                case DEV:
                    dev = true;
                    break;

                case NODEV:
                    dev = false;
                    break;

                case EXEC:
                    exec = true;
                    break;

                case NOEXEC:
                    exec = false;
                    break;

                case USER:
                    user = true;
                    break;

                case NOUSER:
                    user = false;
                    break;

                case KERNMOUNT:
                    kernmount = true;
                    break;

                case ACTIVE:
                    active = true;
                    break;

                default:;
                    /* Locate the unknown option in the unmodified
                     * string so the remainder can be handed to
                     * mount(2) verbatim.
                     */
                    leftovers = strstr(mnt_opts, value);
                    goto free_subopts_str;
                }
            }

        free_subopts_str:
            free(subopts_str);
        }
    mount:
        if (bind && rec && readonly) {
            fprintf(stderr,
                "It's not possible to recursively bind readonly mounts\n");
            return -1;
        }

        if (readwrite && readonly) {
            fprintf(stderr, "Only one of '%s' and '%s' can be specified\n",
                token[RO], token[RW]);
            return -1;
        }

        if (mkdir_flag && touch_flag) {
            fprintf(stderr, "Only one of '%s' and '%s' can be specified\n",
                token[MKDIR], token[TOUCH]);
            return -1;
        }

        unsigned long mountflags = 0;

        if (bind) {
            mountflags |= MS_BIND;
        }

        if (rec) {
            mountflags |= MS_REC;
        }

        if (remount) {
            mountflags |= MS_REMOUNT;
        }

        if (readonly) {
            mountflags |= MS_RDONLY;
        }

        if (!suid) {
            mountflags |= MS_NOSUID;
        }

        if (!dev) {
            mountflags |= MS_NODEV;
        }

        if (!exec) {
            mountflags |= MS_NOEXEC;
        }

        if (!user) {
            mountflags |= MS_NOUSER;
        }

        if (kernmount) {
            mountflags |= MS_KERNMOUNT;
        }

        if (active) {
            mountflags |= MS_ACTIVE;
        }

        if (mkdir_flag) {
            if (-1 == mkdir(entry->mnt_dir, S_IRWXU)) {
                perror("mkdir");
                return -1;
            }
        } else if (touch_flag) {
            int fd = open(entry->mnt_dir, O_EXCL | O_CREAT | O_CLOEXEC, S_IRWXU);
            if (-1 == fd) {
                perror("open");
                return -1;
            }
            close(fd);
        }

        char const *const source =
            0 == strcmp("none", entry->mnt_fsname) ? NULL : entry->mnt_fsname;

        if (-1 == mount(source, entry->mnt_dir, entry->mnt_type,
                mountflags, leftovers)) {
            perror("mount");
            return -1;
        }

        /* A bind mount is made read-only by a second, remounting call. */
        if (bind && readonly) {
            mountflags |= MS_REMOUNT;
            if (-1 == mount(source, entry->mnt_dir, entry->mnt_type,
                    mountflags, leftovers)) {
                perror("mount");
                return -1;
            }
        }
    }

    if (endmntent(fstab) != 1) {
        perror("endmntent");
        return -1;
    }

    return 0;
}

/*
 * Launch SHELL inside a sandbox built from Linux namespaces.
 *
 * Steps, in order: close inherited descriptors; open SHELL while the
 * host file system is still visible; unshare the user and PID
 * namespaces and fork; map the caller's uid/gid one-to-one; unshare the
 * mount and network namespaces; pivot into a tmpfs and chroot into an
 * already-removed directory inside it; unshare IPC/UTS; lower the
 * priority; set the hostname; fork again; clear the permitted and
 * effective capabilities; execveat() the shell through its descriptor.
 *
 * Each parent process waits for its child and returns the child's
 * status. Returns EXIT_FAILURE after printing a diagnostic if any step
 * fails.
 */
int main(void)
{
    {
        /* Any nonzero return is a failure. A positive value is taken
         * to be the errno code itself; a negative one means errno is
         * already set.
         */
        int const status = close_leaked_fds();
        if (status != 0) {
            if (status > 0) {
                errno = status;
            }
            perror("close_leaked_fds");
            return EXIT_FAILURE;
        }
    }

    /* Opened now because SHELL is unreachable once the root changes. */
    int sh_fd = open(SHELL, O_CLOEXEC | O_NONBLOCK | O_NOCTTY);
    if (-1 == sh_fd) {
        perror("open");
        return EXIT_FAILURE;
    }

    uid_t uid = getuid();
    gid_t gid = getgid();

    uid_t mapped_uid = uid;
    gid_t mapped_gid = gid;

    /* Needed to do the rest of the unsharing */
    if (-1 == unshare(CLONE_NEWUSER)) {
        perror("unshare");
        return EXIT_FAILURE;
    }

    /* Prevent signals, ptracing of other processes */
    if (-1 == unshare(CLONE_NEWPID)) {
        perror("unshare");
        return EXIT_FAILURE;
    }

    /* Fork to allow for multithreading and to make the shell less
     * buggy.
     */
    {
        pid_t child = fork();
        if (-1 == child) {
            perror("fork");
            return EXIT_FAILURE;
        }

        if (child != 0) {
            return wait_for_child(child);
        }
    }

    /* setgroups must be denied before gid_map is written. */
    if (-1 == write_proc_file("/proc/self/setgroups", "deny\n")) {
        return EXIT_FAILURE;
    }

    {
        /* uid_t and gid_t are unsigned; print them through unsigned long. */
        char line[64];

        snprintf(line, sizeof line, "%lu %lu 1\n",
            (unsigned long)mapped_uid, (unsigned long)uid);
        if (-1 == write_proc_file("/proc/self/uid_map", line)) {
            return EXIT_FAILURE;
        }

        snprintf(line, sizeof line, "%lu %lu 1\n",
            (unsigned long)mapped_gid, (unsigned long)gid);
        if (-1 == write_proc_file("/proc/self/gid_map", line)) {
            return EXIT_FAILURE;
        }
    }

    if (-1 == setresgid(mapped_gid, mapped_gid, mapped_gid)) {
        perror("setresgid");
        return EXIT_FAILURE;
    }

    if (-1 == setresuid(mapped_uid, mapped_uid, mapped_uid)) {
        perror("setresuid");
        return EXIT_FAILURE;
    }

    /* With chroot prevent messing with user files */
    if (-1 == unshare(CLONE_NEWNS)) {
        perror("unshare");
        return EXIT_FAILURE;
    }

    /* We have unshare the network namespace so we can mount /proc
     * because of /proc/net
     */
    if (-1 == unshare(CLONE_NEWNET)) {
        perror("unshare");
        return EXIT_FAILURE;
    }

    if (0) {
        /* Disabled: root populated from fstab_config. */
        if (-1 == populate_root_from_fstab()) {
            return EXIT_FAILURE;
        }

        if (-1 == pivot_into_cwd()) {
            return EXIT_FAILURE;
        }
    } else {
        /* Root is an empty directory that no longer has a name. */
        if (-1 == enter_runtime_tmpfs()) {
            return EXIT_FAILURE;
        }

        if (-1 == mkdir("sandbox", S_IRWXU)) {
            if (errno != EEXIST) {
                perror("mkdir");
                return EXIT_FAILURE;
            }
        }

        if (-1 == pivot_into_cwd()) {
            return EXIT_FAILURE;
        }

        int sandbox_fd = open("sandbox", O_CLOEXEC | O_DIRECTORY);
        if (-1 == sandbox_fd) {
            perror("open");
            return EXIT_FAILURE;
        }

        if (-1 == fchdir(sandbox_fd)) {
            perror("fchdir");
            return EXIT_FAILURE;
        }

        /* Unlink the directory while standing inside it. */
        if (-1 == rmdir("../sandbox")) {
            perror("rmdir");
            return EXIT_FAILURE;
        }

        if (-1 == chroot(".")) {
            perror("chroot");
            return EXIT_FAILURE;
        }

        if (-1 == chdir("/")) {
            perror("chdir");
            return EXIT_FAILURE;
        }

        close(sandbox_fd);
    }

    /* Sandbox the rest of the namespaces */

    /* NOTE(review): the previous comment here said the IPC namespace
     * cannot be unshared because X11's shared memory extensions need
     * it, yet the call below does unshare it. Confirm which is
     * intended.
     */
    if (-1 == unshare(CLONE_NEWIPC | CLONE_NEWUTS)) {
        perror("unshare");
        return EXIT_FAILURE;
    }

    /* Favor other processes over this process hierarchy. Only
     * superuser may lower priorities so this is not stoppable. This
     * also makes the process hierarchy nicer for the OOM killer.
     */
    {
        /* getpriority() may legitimately return -1; only errno tells
         * an error apart.
         */
        errno = 0;
        int const priority = getpriority(PRIO_PROCESS, 0);
        if (-1 == priority && errno != 0) {
            perror("getpriority");
            return EXIT_FAILURE;
        }

        if (-1 == setpriority(PRIO_PROCESS, 0, priority + 1)) {
            perror("setpriority");
            return EXIT_FAILURE;
        }
    }

    if (-1 == sethostname(HOSTNAME, sizeof HOSTNAME - 1U)) {
        perror("sethostname");
        return EXIT_FAILURE;
    }

    if (0) {
        /* Disabled: conventional /dev symlinks, { target, link name }. */
        static char const *const dev_links[][2] = {
            { "/proc/self/fd", "/dev/fd" },
            { "/proc/self/fd/0", "/dev/stdin" },
            { "/proc/self/fd/1", "/dev/stdout" },
            { "/proc/self/fd/2", "/dev/stderr" },
            { "/run/shm", "/dev/shm" },
            { "/dev/pts/ptmx", "/dev/ptmx" }
        };

        for (size_t ii = 0U; ii < sizeof dev_links / sizeof dev_links[0U]; ++ii) {
            if (-1 == symlink(dev_links[ii][0U], dev_links[ii][1U])) {
                perror("symlink");
                return EXIT_FAILURE;
            }
        }
    }

    /* Keep init super privileged */
    {
        pid_t child = fork();
        if (-1 == child) {
            perror("fork");
            return EXIT_FAILURE;
        }

        if (child != 0) {
            return wait_for_child(child);
        }
    }

    /* In the shell drop all privileges I might possibly have.
     * NOTE(review): only the permitted and effective sets are cleared;
     * the inheritable set is left as is. Confirm that is intended.
     */
    cap_t caps = cap_get_proc();
    if (NULL == caps) {
        perror("cap_get_proc");
        return EXIT_FAILURE;
    }

    if (-1 == cap_clear_flag(caps, CAP_PERMITTED)) {
        perror("cap_clear_flag");
        return EXIT_FAILURE;
    }
    if (-1 == cap_clear_flag(caps, CAP_EFFECTIVE)) {
        perror("cap_clear_flag");
        return EXIT_FAILURE;
    }

    if (-1 == cap_set_proc(caps)) {
        perror("cap_set_proc");
        return EXIT_FAILURE;
    }

    if (-1 == cap_free(caps)) {
        perror("cap_free");
        return EXIT_FAILURE;
    }

    /* Execute the shell through the descriptor opened at startup; this
     * only returns on failure.
     */
    syscall(__NR_execveat, sh_fd, "",
        (char *const *)shell_arguments, shell_environment,
        AT_EMPTY_PATH);
    perror("execveat");
    return EXIT_FAILURE;
}

/*
 * Close every file descriptor this process currently has open, except
 * stdin, stdout and stderr.
 *
 * The descriptors are enumerated from /proc/self/fd and collected
 * first; nothing is closed until the directory stream itself has been
 * closed, so the enumeration never runs over a changing table.
 * Entries whose name is not a plain non-negative decimal number are
 * ignored. Failures of the individual close() calls are deliberately
 * ignored (best effort).
 *
 * Returns 0 on success. On failure returns -1 with errno set, and no
 * descriptor has been closed.
 */
static int close_leaked_fds(void)
{
    int errnum = 0;
    size_t count = 0U;
    size_t capacity = 0U;
    int *fds = NULL;

    DIR *const fds_dir = opendir("/proc/self/fd");
    if (NULL == fds_dir) {
        /* errno was set by opendir */
        return -1;
    }

    /* The stream's own descriptor shows up in the listing; skip it. */
    int const dir_fd = dirfd(fds_dir);

    for (;;) {
        errno = 0;
        struct dirent *const result = readdir(fds_dir);
        if (NULL == result) {
            /* End of directory leaves errno at 0; an error sets it. */
            errnum = errno;
            break;
        }

        char const *const d_name = result->d_name;

        char *end = NULL;
        long const parsed = strtol(d_name, &end, 10);
        int const fd = (int)parsed;
        if (end == d_name || *end != '\0' || parsed < 0 || (long)fd != parsed) {
            /* ".", ".." or anything else that is not a descriptor. */
            continue;
        }

        if (fd == dir_fd) {
            continue;
        }

        if (STDIN_FILENO == fd || STDOUT_FILENO == fd || STDERR_FILENO == fd) {
            continue;
        }

        if (count == capacity) {
            size_t const new_capacity = 0U == capacity ? 16U : capacity * 2U;
            int *const grown = realloc(fds, new_capacity * sizeof fds[0]);
            if (NULL == grown) {
                errnum = ENOMEM;
                break;
            }
            fds = grown;
            capacity = new_capacity;
        }

        fds[count] = fd;
        ++count;
    }

    if (-1 == closedir(fds_dir)) {
        /* Keep the first error if one was already recorded. */
        if (0 == errnum) {
            errnum = errno;
        }
    }

    if (0 == errnum) {
        for (size_t ii = 0U; ii < count; ++ii) {
            close(fds[ii]);
        }
    }

    free(fds);

    if (errnum != 0) {
        errno = errnum;
        return -1;
    }

    return 0;
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants