From 052ba25769500e2a2a1a0a3f9be41a80b7d0090d Mon Sep 17 00:00:00 2001 From: yihuaf Date: Tue, 14 Sep 2021 08:54:35 +0200 Subject: [PATCH 01/10] Implement the seccomp profile --- .github/workflows/main.yml | 8 +- Cargo.lock | 10 + Cargo.toml | 1 + README.md | 4 +- src/lib.rs | 1 + src/process/init.rs | 22 +- src/seccomp/fixture/config.json | 972 ++++++++++++++++++++++++++++++++ src/seccomp/mod.rs | 2 + src/seccomp/seccomp.rs | 401 +++++++++++++ 9 files changed, 1407 insertions(+), 14 deletions(-) create mode 100644 src/seccomp/fixture/config.json create mode 100644 src/seccomp/mod.rs create mode 100644 src/seccomp/seccomp.rs diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 72f98fdde..38a7ea594 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -43,7 +43,7 @@ jobs: working-directory: ./cgroups - run: rustup component add rustfmt clippy - run: sudo apt-get -y update - - run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev libelf-dev + - run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev libelf-dev libseccomp-dev - name: Check formatting run: cargo fmt --all -- --check working-directory: ${{matrix.dirs}} @@ -68,7 +68,7 @@ jobs: with: working-directory: ./cgroups - run: sudo apt-get -y update - - run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev libelf-dev + - run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev libelf-dev libseccomp-dev - name: Run tests run: cargo test --all --all-features --no-fail-fast coverage: @@ -98,7 +98,7 @@ jobs: - name: Update System Libraries run: sudo apt-get -y update - name: Install System Libraries - run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev libelf-dev + run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev libelf-dev libseccomp-dev - name: Run Test Coverage for youki run: | cargo llvm-cov clean --workspace @@ -143,7 +143,7 @@ jobs: with: 
working-directory: ./cgroups - run: sudo apt-get -y update - - run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev libelf-dev + - run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev libelf-dev libseccomp-dev - name: Build run: ./build.sh --release - uses: actions/setup-go@v2 diff --git a/Cargo.lock b/Cargo.lock index 2903e044a..3b6952336 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -890,6 +890,15 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" +[[package]] +name = "seccomp-sys" +version = "0.1.3" +source = "git+https://github.com/polachok/seccomp-sys.git?rev=9d89b10f9faa19e8f4e952663697ec126f2e2121#9d89b10f9faa19e8f4e952663697ec126f2e2121" +dependencies = [ + "libc", + "pkg-config", +] + [[package]] name = "serde" version = "1.0.130" @@ -1151,6 +1160,7 @@ dependencies = [ "prctl", "procfs", "quickcheck", + "seccomp-sys", "serde", "serde_json", "serial_test", diff --git a/Cargo.toml b/Cargo.toml index 856d3db63..a92a4c7a9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,6 +41,7 @@ dbus = "0.9.2" tabwriter = "1" fastrand = "1.4.1" crossbeam-channel = "0.5" +seccomp-sys = { git = "https://github.com/polachok/seccomp-sys.git", rev = "9d89b10f9faa19e8f4e952663697ec126f2e2121"} [dev-dependencies] oci-spec = { git = "https://github.com/utam0k/oci-spec-rs/", tag = "v0.4.0-with-bugfix", features = ["proptests"] } diff --git a/README.md b/README.md index d5e42530a..52cf61971 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,8 @@ $ sudo apt-get install \ libsystemd-dev \ libdbus-glib-1-dev \ build-essential \ - libelf-dev + libelf-dev \ + libseccomp-dev ``` ### Fedora, Centos, RHEL and related distributions @@ -86,6 +87,7 @@ $ sudo dnf install \ systemd-devel \ dbus-devel \ elfutils-libelf-devel \ + libseccomp-devel ``` ## Build diff --git a/src/lib.rs b/src/lib.rs index 6a20a35fd..da320ee1e 100644 --- a/src/lib.rs +++ 
b/src/lib.rs @@ -9,6 +9,7 @@ pub mod notify_socket; pub mod process; pub mod rootfs; pub mod rootless; +pub mod seccomp; pub mod signal; pub mod syscall; pub mod tty; diff --git a/src/process/init.rs b/src/process/init.rs index 03eb97e8d..d871a1237 100644 --- a/src/process/init.rs +++ b/src/process/init.rs @@ -1,3 +1,8 @@ +use super::args::ContainerArgs; +use crate::{ + capabilities, hooks, namespaces::Namespaces, process::channel, rootfs, rootless::Rootless, + seccomp, syscall::Syscall, tty, utils, +}; use anyhow::{bail, Context, Result}; use nix::mount::mount as nix_mount; use nix::mount::MsFlags; @@ -9,17 +14,12 @@ use nix::{ }; use oci_spec::runtime::{LinuxNamespaceType, User}; use std::collections::HashMap; -use std::{env, os::unix::io::AsRawFd}; -use std::{fs, path::Path, path::PathBuf}; - -use crate::rootless::Rootless; -use crate::{ - capabilities, hooks, namespaces::Namespaces, process::channel, rootfs, syscall::Syscall, tty, - utils, +use std::{ + env, fs, + os::unix::io::AsRawFd, + path::{Path, PathBuf}, }; -use super::args::ContainerArgs; - // Make sure a given path is on procfs. This is to avoid the security risk that // /proc path is mounted over. Ref: CVE-2019-16884 fn ensure_procfs(path: &Path) -> Result<()> { @@ -377,6 +377,10 @@ pub fn container_init( } } + // Initialize seccomp profile right before we are ready to execute the + // payload. The notify socket will still need network related syscalls. 
+ seccomp::initialize_seccomp(linux.seccomp.as_ref()).context("Failed to execute seccomp")?; + if let Some(args) = proc.args.as_ref() { utils::do_exec(&args[0], args)?; } else { diff --git a/src/seccomp/fixture/config.json b/src/seccomp/fixture/config.json new file mode 100644 index 000000000..ec676fae6 --- /dev/null +++ b/src/seccomp/fixture/config.json @@ -0,0 +1,972 @@ +{ + "ociVersion": "1.0.1-dev", + "process": { + "terminal": false, + "user": { + "uid": 0, + "gid": 0 + }, + "args": [ + "helloworld" + ], + "env": [ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "TERM=xterm" + ], + "cwd": "/", + "capabilities": { + "bounding": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ], + "effective": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ], + "inheritable": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ], + "permitted": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ], + "ambient": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ] + }, + "rlimits": [ + { + "type": "RLIMIT_NOFILE", + "hard": 1024, + "soft": 1024 + } + ], + "noNewPrivileges": true + }, + "root": { + "path": "tests/assets/oci/helloworld/rootfs" + }, + "hostname": "runc", + "mounts": [ + { + "destination": "/proc", + "type": "proc", + "source": "proc" + }, + { + "destination": "/dev", + "type": "tmpfs", + "source": "tmpfs", + "options": [ + "nosuid", + "strictatime", + "mode=755", + "size=65536k" + ] + }, + { + "destination": "/dev/pts", + "type": "devpts", + "source": "devpts", + "options": [ + "nosuid", + "noexec", + "newinstance", + "ptmxmode=0666", + "mode=0620", + "gid=5" + ] + }, + { + "destination": "/dev/shm", + "type": "tmpfs", + "source": "shm", + "options": [ + "nosuid", + "noexec", + "nodev", + "mode=1777", + "size=65536k" + ] + }, + { + "destination": "/dev/mqueue", + "type": "mqueue", + "source": "mqueue", + "options": [ + "nosuid", + "noexec", + "nodev" + ] + }, + { + 
"destination": "/sys", + "type": "sysfs", + "source": "sysfs", + "options": [ + "nosuid", + "noexec", + "nodev", + "ro" + ] + }, + { + "destination": "/sys/fs/cgroup", + "type": "cgroup", + "source": "cgroup", + "options": [ + "nosuid", + "noexec", + "nodev", + "relatime", + "ro" + ] + } + ], + "linux": { + "devices": [ + { + "path": "/dev/kvm", + "type": "c", + "major": 10, + "minor": 232, + "fileMode": 666, + "uid": 0, + "gid": 36 + } + ], + "seccomp": { + "defaultAction": "SCMP_ACT_ERRNO", + "defaultErrnoRet": 1, + "archMap": [ + { + "architecture": "SCMP_ARCH_X86_64", + "subArchitectures": [ + "SCMP_ARCH_X86", + "SCMP_ARCH_X32" + ] + }, + { + "architecture": "SCMP_ARCH_AARCH64", + "subArchitectures": [ + "SCMP_ARCH_ARM" + ] + }, + { + "architecture": "SCMP_ARCH_MIPS64", + "subArchitectures": [ + "SCMP_ARCH_MIPS", + "SCMP_ARCH_MIPS64N32" + ] + }, + { + "architecture": "SCMP_ARCH_MIPS64N32", + "subArchitectures": [ + "SCMP_ARCH_MIPS", + "SCMP_ARCH_MIPS64" + ] + }, + { + "architecture": "SCMP_ARCH_MIPSEL64", + "subArchitectures": [ + "SCMP_ARCH_MIPSEL", + "SCMP_ARCH_MIPSEL64N32" + ] + }, + { + "architecture": "SCMP_ARCH_MIPSEL64N32", + "subArchitectures": [ + "SCMP_ARCH_MIPSEL", + "SCMP_ARCH_MIPSEL64" + ] + }, + { + "architecture": "SCMP_ARCH_S390X", + "subArchitectures": [ + "SCMP_ARCH_S390" + ] + } + ], + "syscalls": [ + { + "names": [ + "accept", + "accept4", + "access", + "adjtimex", + "alarm", + "bind", + "brk", + "capget", + "capset", + "chdir", + "chmod", + "chown", + "chown32", + "clock_adjtime", + "clock_adjtime64", + "clock_getres", + "clock_getres_time64", + "clock_gettime", + "clock_gettime64", + "clock_nanosleep", + "clock_nanosleep_time64", + "close", + "close_range", + "connect", + "copy_file_range", + "creat", + "dup", + "dup2", + "dup3", + "epoll_create", + "epoll_create1", + "epoll_ctl", + "epoll_ctl_old", + "epoll_pwait", + "epoll_pwait2", + "epoll_wait", + "epoll_wait_old", + "eventfd", + "eventfd2", + "execve", + "execveat", + "exit", + 
"exit_group", + "faccessat", + "faccessat2", + "fadvise64", + "fadvise64_64", + "fallocate", + "fanotify_mark", + "fchdir", + "fchmod", + "fchmodat", + "fchown", + "fchown32", + "fchownat", + "fcntl", + "fcntl64", + "fdatasync", + "fgetxattr", + "flistxattr", + "flock", + "fork", + "fremovexattr", + "fsetxattr", + "fstat", + "fstat64", + "fstatat64", + "fstatfs", + "fstatfs64", + "fsync", + "ftruncate", + "ftruncate64", + "futex", + "futex_time64", + "futimesat", + "getcpu", + "getcwd", + "getdents", + "getdents64", + "getegid", + "getegid32", + "geteuid", + "geteuid32", + "getgid", + "getgid32", + "getgroups", + "getgroups32", + "getitimer", + "getpeername", + "getpgid", + "getpgrp", + "getpid", + "getppid", + "getpriority", + "getrandom", + "getresgid", + "getresgid32", + "getresuid", + "getresuid32", + "getrlimit", + "get_robust_list", + "getrusage", + "getsid", + "getsockname", + "getsockopt", + "get_thread_area", + "gettid", + "gettimeofday", + "getuid", + "getuid32", + "getxattr", + "inotify_add_watch", + "inotify_init", + "inotify_init1", + "inotify_rm_watch", + "io_cancel", + "ioctl", + "io_destroy", + "io_getevents", + "io_pgetevents", + "io_pgetevents_time64", + "ioprio_get", + "ioprio_set", + "io_setup", + "io_submit", + "io_uring_enter", + "io_uring_register", + "io_uring_setup", + "ipc", + "kill", + "lchown", + "lchown32", + "lgetxattr", + "link", + "linkat", + "listen", + "listxattr", + "llistxattr", + "_llseek", + "lremovexattr", + "lseek", + "lsetxattr", + "lstat", + "lstat64", + "madvise", + "membarrier", + "memfd_create", + "mincore", + "mkdir", + "mkdirat", + "mknod", + "mknodat", + "mlock", + "mlock2", + "mlockall", + "mmap", + "mmap2", + "mprotect", + "mq_getsetattr", + "mq_notify", + "mq_open", + "mq_timedreceive", + "mq_timedreceive_time64", + "mq_timedsend", + "mq_timedsend_time64", + "mq_unlink", + "mremap", + "msgctl", + "msgget", + "msgrcv", + "msgsnd", + "msync", + "munlock", + "munlockall", + "munmap", + "nanosleep", + "newfstatat", + 
"_newselect", + "open", + "openat", + "openat2", + "pause", + "pidfd_open", + "pidfd_send_signal", + "pipe", + "pipe2", + "poll", + "ppoll", + "ppoll_time64", + "prctl", + "pread64", + "preadv", + "preadv2", + "prlimit64", + "pselect6", + "pselect6_time64", + "pwrite64", + "pwritev", + "pwritev2", + "read", + "readahead", + "readlink", + "readlinkat", + "readv", + "recv", + "recvfrom", + "recvmmsg", + "recvmmsg_time64", + "recvmsg", + "remap_file_pages", + "removexattr", + "rename", + "renameat", + "renameat2", + "restart_syscall", + "rmdir", + "rseq", + "rt_sigaction", + "rt_sigpending", + "rt_sigprocmask", + "rt_sigqueueinfo", + "rt_sigreturn", + "rt_sigsuspend", + "rt_sigtimedwait", + "rt_sigtimedwait_time64", + "rt_tgsigqueueinfo", + "sched_getaffinity", + "sched_getattr", + "sched_getparam", + "sched_get_priority_max", + "sched_get_priority_min", + "sched_getscheduler", + "sched_rr_get_interval", + "sched_rr_get_interval_time64", + "sched_setaffinity", + "sched_setattr", + "sched_setparam", + "sched_setscheduler", + "sched_yield", + "seccomp", + "select", + "semctl", + "semget", + "semop", + "semtimedop", + "semtimedop_time64", + "send", + "sendfile", + "sendfile64", + "sendmmsg", + "sendmsg", + "sendto", + "setfsgid", + "setfsgid32", + "setfsuid", + "setfsuid32", + "setgid", + "setgid32", + "setgroups", + "setgroups32", + "setitimer", + "setpgid", + "setpriority", + "setregid", + "setregid32", + "setresgid", + "setresgid32", + "setresuid", + "setresuid32", + "setreuid", + "setreuid32", + "setrlimit", + "set_robust_list", + "setsid", + "setsockopt", + "set_thread_area", + "set_tid_address", + "setuid", + "setuid32", + "setxattr", + "shmat", + "shmctl", + "shmdt", + "shmget", + "shutdown", + "sigaltstack", + "signalfd", + "signalfd4", + "sigprocmask", + "sigreturn", + "socket", + "socketcall", + "socketpair", + "splice", + "stat", + "stat64", + "statfs", + "statfs64", + "statx", + "symlink", + "symlinkat", + "sync", + "sync_file_range", + "syncfs", + "sysinfo", 
+ "tee", + "tgkill", + "time", + "timer_create", + "timer_delete", + "timer_getoverrun", + "timer_gettime", + "timer_gettime64", + "timer_settime", + "timer_settime64", + "timerfd_create", + "timerfd_gettime", + "timerfd_gettime64", + "timerfd_settime", + "timerfd_settime64", + "times", + "tkill", + "truncate", + "truncate64", + "ugetrlimit", + "umask", + "uname", + "unlink", + "unlinkat", + "utime", + "utimensat", + "utimensat_time64", + "utimes", + "vfork", + "vmsplice", + "wait4", + "waitid", + "waitpid", + "write", + "writev" + ], + "action": "SCMP_ACT_ALLOW" + }, + { + "names": [ + "process_vm_readv", + "process_vm_writev", + "ptrace" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "minKernel": "4.8" + } + }, + { + "names": [ + "personality" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 0, + "value": 0, + "op": "SCMP_CMP_EQ" + } + ] + }, + { + "names": [ + "personality" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 0, + "value": 8, + "op": "SCMP_CMP_EQ" + } + ] + }, + { + "names": [ + "personality" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 0, + "value": 131072, + "op": "SCMP_CMP_EQ" + } + ] + }, + { + "names": [ + "personality" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 0, + "value": 131080, + "op": "SCMP_CMP_EQ" + } + ] + }, + { + "names": [ + "personality" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 0, + "value": 4294967295, + "op": "SCMP_CMP_EQ" + } + ] + }, + { + "names": [ + "sync_file_range2" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "arches": [ + "ppc64le" + ] + } + }, + { + "names": [ + "arm_fadvise64_64", + "arm_sync_file_range", + "sync_file_range2", + "breakpoint", + "cacheflush", + "set_tls" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "arches": [ + "arm", + "arm64" + ] + } + }, + { + "names": [ + "arch_prctl" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "arches": [ + "amd64", + "x32" + ] + } + }, + { + "names": [ + "modify_ldt" 
+ ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "arches": [ + "amd64", + "x32", + "x86" + ] + } + }, + { + "names": [ + "s390_pci_mmio_read", + "s390_pci_mmio_write", + "s390_runtime_instr" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "arches": [ + "s390", + "s390x" + ] + } + }, + { + "names": [ + "open_by_handle_at" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_DAC_READ_SEARCH" + ] + } + }, + { + "names": [ + "bpf", + "clone", + "clone3", + "fanotify_init", + "fsconfig", + "fsmount", + "fsopen", + "fspick", + "lookup_dcookie", + "mount", + "move_mount", + "name_to_handle_at", + "open_tree", + "perf_event_open", + "quotactl", + "setdomainname", + "sethostname", + "setns", + "syslog", + "umount", + "umount2", + "unshare" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_SYS_ADMIN" + ] + } + }, + { + "names": [ + "clone" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 0, + "value": 2114060288, + "op": "SCMP_CMP_MASKED_EQ" + } + ], + "excludes": { + "caps": [ + "CAP_SYS_ADMIN" + ], + "arches": [ + "s390", + "s390x" + ] + } + }, + { + "names": [ + "clone" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 1, + "value": 2114060288, + "op": "SCMP_CMP_MASKED_EQ" + } + ], + "comment": "s390 parameter ordering for clone is different", + "includes": { + "arches": [ + "s390", + "s390x" + ] + }, + "excludes": { + "caps": [ + "CAP_SYS_ADMIN" + ] + } + }, + { + "names": [ + "clone3" + ], + "action": "SCMP_ACT_ERRNO", + "errnoRet": 38, + "excludes": { + "caps": [ + "CAP_SYS_ADMIN" + ] + } + }, + { + "names": [ + "reboot" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_SYS_BOOT" + ] + } + }, + { + "names": [ + "chroot" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_SYS_CHROOT" + ] + } + }, + { + "names": [ + "delete_module", + "init_module", + "finit_module" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_SYS_MODULE" + ] + } + }, + { 
+ "names": [ + "acct" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_SYS_PACCT" + ] + } + }, + { + "names": [ + "kcmp", + "pidfd_getfd", + "process_madvise", + "process_vm_readv", + "process_vm_writev", + "ptrace" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_SYS_PTRACE" + ] + } + }, + { + "names": [ + "iopl", + "ioperm" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_SYS_RAWIO" + ] + } + }, + { + "names": [ + "settimeofday", + "stime", + "clock_settime" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_SYS_TIME" + ] + } + }, + { + "names": [ + "vhangup" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_SYS_TTY_CONFIG" + ] + } + }, + { + "names": [ + "get_mempolicy", + "mbind", + "set_mempolicy" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_SYS_NICE" + ] + } + }, + { + "names": [ + "syslog" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_SYSLOG" + ] + } + } + ] + }, + "resources": { + "devices": [ + { + "allow": true, + "access": "rwm" + } + ] + }, + "uidMappings": [ + { + "containerID": 0, + "hostID": 1000, + "size": 1 + } + ], + "gidMappings": [ + { + "containerID": 0, + "hostID": 1000, + "size": 1 + } + ], + "namespaces": [ + { + "type": "pid" + }, + { + "type": "network" + }, + { + "type": "user" + }, + { + "type": "ipc" + }, + { + "type": "uts" + }, + { + "type": "mount" + } + ], + "maskedPaths": [ + "/proc/kcore", + "/proc/latency_stats", + "/proc/timer_list", + "/proc/timer_stats", + "/proc/sched_debug", + "/sys/firmware", + "/proc/scsi" + ], + "readonlyPaths": [ + "/proc/asound", + "/proc/bus", + "/proc/fs", + "/proc/irq", + "/proc/sys", + "/proc/sysrq-trigger" + ] + } +} \ No newline at end of file diff --git a/src/seccomp/mod.rs b/src/seccomp/mod.rs new file mode 100644 index 000000000..1f85a66a3 --- /dev/null +++ b/src/seccomp/mod.rs @@ -0,0 +1,2 @@ +pub mod seccomp; +pub use 
seccomp::initialize_seccomp; diff --git a/src/seccomp/seccomp.rs b/src/seccomp/seccomp.rs new file mode 100644 index 000000000..0d4ce9df2 --- /dev/null +++ b/src/seccomp/seccomp.rs @@ -0,0 +1,401 @@ +use anyhow::bail; +use anyhow::Context; +use anyhow::Result; +use oci_spec::runtime::Arch; +use oci_spec::runtime::LinuxSeccomp; +use oci_spec::runtime::LinuxSeccompAction; +use oci_spec::runtime::LinuxSeccompOperator; +use seccomp_sys::scmp_arch::*; +use seccomp_sys::scmp_compare::*; +use seccomp_sys::*; +use std::ffi::CString; + +#[derive(Debug)] +pub struct Compare { + // The zero-indexed index of the syscall arguement. + arg: libc::c_uint, + op: Option, + datum_a: Option, + datum_b: Option, +} + +impl Compare { + pub fn new(args: u32) -> Self { + Compare { + arg: args as libc::c_uint, + op: None, + datum_a: None, + datum_b: None, + } + } + + pub fn op(mut self, op: scmp_compare) -> Self { + self.op = Some(op); + + self + } + + pub fn datum_a(mut self, datum: scmp_datum_t) -> Self { + self.datum_a = Some(datum); + + self + } + + pub fn datum_b(mut self, datum: scmp_datum_t) -> Self { + self.datum_b = Some(datum); + + self + } + + pub fn build(self) -> Result { + if self.op.is_some() && self.datum_a.is_some() { + Ok(scmp_arg_cmp { + arg: self.arg, + op: self.op.unwrap().into(), + datum_a: self.datum_a.unwrap(), + datum_b: self.datum_b.unwrap_or(0), + }) + } else { + bail!("op and datum_a is required: {:?}", self); + } + } +} + +#[derive(Debug)] +pub struct Rule { + action: u32, + syscall_nr: i32, + comparators: Vec, +} + +impl Rule { + pub fn new(action: u32, syscall_number: i32) -> Self { + Rule { + action, + syscall_nr: syscall_number, + comparators: vec![], + } + } + + pub fn add_comparator(&mut self, cmp: scmp_arg_cmp) { + self.comparators.push(cmp); + } +} + +#[derive(Debug)] +struct FilterContext { + ctx: *mut scmp_filter_ctx, +} + +impl FilterContext { + pub fn default(default_action: u32) -> Result { + let filter_ctx = unsafe { seccomp_init(default_action) }; 
+ if filter_ctx.is_null() { + bail!("Failed to initialized seccomp profile") + } + + Ok(FilterContext { ctx: filter_ctx }) + } + + pub fn add_rule(&mut self, rule: &Rule) -> Result<()> { + let res = match rule.comparators.len() { + 0 => unsafe { seccomp_rule_add(self.ctx, rule.action, rule.syscall_nr, 0) }, + _ => unsafe { + seccomp_rule_add_array( + self.ctx, + rule.action, + rule.syscall_nr, + rule.comparators.len() as u32, + rule.comparators.as_slice().as_ptr(), + ) + }, + }; + if res != 0 { + bail!("Failed to add rule. Errno: {}, Rule: {:?}", res, rule); + } + + Ok(()) + } + + pub fn add_arch(&mut self, arch: u32) -> Result<()> { + let res = unsafe { seccomp_arch_add(self.ctx, arch) }; + if res != 0 { + if nix::Error::from_i32(res.abs()) != nix::Error::EEXIST { + // The architecture already existed in the profile, so we can + // safely ignore the error here. Otherwise, error out. + bail!("Failed to add architecture {}. Errno: {}", arch, res); + } + } + + Ok(()) + } + + pub fn load(&self) -> Result<()> { + let res = unsafe { seccomp_load(self.ctx) }; + if res != 0 { + bail!("Failed to load seccomp profile: {}", res); + } + + Ok(()) + } +} + +fn translate_syscall(syscall_name: String) -> Result { + let c_syscall_name = CString::new(syscall_name.as_str()) + .with_context(|| format!("Failed to convert syscall {:?} to cstring", syscall_name))?; + let res = unsafe { seccomp_syscall_resolve_name(c_syscall_name.as_ptr()) }; + if res == __NR_SCMP_ERROR { + bail!("Failed to resolve syscall from name: {:?}", syscall_name); + } + + Ok(res) +} + +fn translate_action(action: &LinuxSeccompAction, errno: Option) -> u32 { + let errno = errno.unwrap_or(libc::EPERM as u32); + match action { + LinuxSeccompAction::ScmpActKill => SCMP_ACT_KILL, + LinuxSeccompAction::ScmpActTrap => SCMP_ACT_TRAP, + LinuxSeccompAction::ScmpActErrno => SCMP_ACT_ERRNO(errno), + LinuxSeccompAction::ScmpActTrace => SCMP_ACT_TRACE(errno), + LinuxSeccompAction::ScmpActAllow => SCMP_ACT_ALLOW, + 
LinuxSeccompAction::ScmpActKillProcess => SCMP_ACT_KILL_PROCESS, + LinuxSeccompAction::ScmpActNotify => SCMP_ACT_NOTIFY, + LinuxSeccompAction::ScmpActLog => SCMP_ACT_LOG, + } +} + +fn translate_op(op: &LinuxSeccompOperator) -> scmp_compare { + match op { + LinuxSeccompOperator::ScmpCmpNe => SCMP_CMP_NE, + LinuxSeccompOperator::ScmpCmpLt => SCMP_CMP_LT, + LinuxSeccompOperator::ScmpCmpLe => SCMP_CMP_LE, + LinuxSeccompOperator::ScmpCmpEq => SCMP_CMP_EQ, + LinuxSeccompOperator::ScmpCmpGe => SCMP_CMP_GE, + LinuxSeccompOperator::ScmpCmpGt => SCMP_CMP_GT, + LinuxSeccompOperator::ScmpCmpMaskedEq => SCMP_CMP_MASKED_EQ, + } +} + +fn translate_arch(arch: &Arch) -> scmp_arch { + match arch { + Arch::ScmpArchNative => SCMP_ARCH_NATIVE, + Arch::ScmpArchX86 => SCMP_ARCH_X86, + Arch::ScmpArchX86_64 => SCMP_ARCH_X86_64, + Arch::ScmpArchX32 => SCMP_ARCH_X32, + Arch::ScmpArchArm => SCMP_ARCH_ARM, + Arch::ScmpArchAarch64 => SCMP_ARCH_AARCH64, + Arch::ScmpArchMips => SCMP_ARCH_MIPS, + Arch::ScmpArchMips64 => SCMP_ARCH_MIPS64, + Arch::ScmpArchMips64n32 => SCMP_ARCH_MIPS64N32, + Arch::ScmpArchMipsel => SCMP_ARCH_MIPSEL, + Arch::ScmpArchMipsel64 => SCMP_ARCH_MIPSEL64, + Arch::ScmpArchMipsel64n32 => SCMP_ARCH_MIPSEL64N32, + Arch::ScmpArchPpc => SCMP_ARCH_PPC, + Arch::ScmpArchPpc64 => SCMP_ARCH_PPC64, + Arch::ScmpArchPpc64le => SCMP_ARCH_PPC64LE, + Arch::ScmpArchS390 => SCMP_ARCH_S390, + Arch::ScmpArchS390x => SCMP_ARCH_S390X, + } +} + +pub fn initialize_seccomp(seccomp: Option<&LinuxSeccomp>) -> Result<()> { + if seccomp.is_none() { + return Ok(()); + } + + let seccomp = seccomp.unwrap(); + if seccomp.flags.is_some() { + // runc did not support this, so let's skip it for now. + bail!("seccomp flags are not yet supported"); + } + + // log::debug!("XXX seccomp: {:?}", seccomp); + + // TODO: fix default action error number. The spec repo doesn't have it yet. 
+ let default_action = translate_action(&seccomp.default_action, None); + let mut ctx = FilterContext::default(default_action)?; + + if let Some(architectures) = seccomp.architectures.as_ref() { + for arch in architectures { + let arch_token = translate_arch(arch); + ctx.add_arch(arch_token as u32) + .context("Failed to add arch to seccomp")?; + } + } + + if let Some(syscalls) = seccomp.syscalls.as_ref() { + for syscall in syscalls { + let action = translate_action(&syscall.action, syscall.errno_ret); + if action == default_action { + // When the action is the same as the default action, the rule is redundent. We can + // skip this here to avoid failing when we add the rules. + log::warn!( + "Detect a seccomp action that is the same as the default action: {:?}", + syscall + ); + continue; + } + + for name in &syscall.names { + let ret = translate_syscall(name.clone()); + if ret.is_err() { + // If we failed to resolve the syscall by name, likely the kernel + // doeesn't support this syscall. So it is safe to skip... + log::warn!( + "Failed to resolve syscall, likely kernel doesn't support this. {:?}", + name + ); + continue; + } + + let syscall_number = translate_syscall(name.clone())?; + // Not clear why but if there are multiple arg attached to one + // syscall rule, we have to add them seperatly. add_rule will + // return EINVAL. runc does the same but doesn't explain why. + match syscall.args.as_ref() { + Some(args) => { + for arg in args { + let mut rule = Rule::new(action, syscall_number); + let cmp = Compare::new(arg.index as u32) + .op(translate_op(&arg.op)) + .datum_a(arg.value) + .datum_b(arg.value_two.unwrap_or(0)) + .build() + .context("Failed to build a seccomp compare rule")?; + rule.add_comparator(cmp); + ctx.add_rule(&rule).with_context(|| { + format!( + "Failed to add seccomp rule: {:?}. 
Syscall: {:?}", + &rule, name, + ) + })?; + } + } + None => { + let rule = Rule::new(action, syscall_number); + ctx.add_rule(&rule).with_context(|| { + format!( + "Failed to add seccomp rule: {:?}. Syscall: {:?}", + &rule, name, + ) + })?; + } + } + } + } + } + + let _ = prctl::set_no_new_privileges(true); + ctx.load().context("Failed to load seccomp context")?; + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use anyhow::Result; + use mio::unix::pipe; + use nix::sys::wait; + use oci_spec::runtime::{Arch, LinuxSeccomp, LinuxSyscall}; + use serial_test::serial; + use std::io::Read; + use std::io::Write; + use std::os::unix::prelude::AsRawFd; + use std::path; + + #[test] + #[serial] + fn test_basic() -> Result<()> { + // Note: seccomp profile is really hard to write unit test for. First, + // we can't really test default error or kill action, since rust test + // actually replies on certain syscalls. Second, some of the syscall + // will not return errorno. These syscalls will just send an abort + // signal or even just segfaults. Here we choose to use `getcwd` + // syscall for testing. This is more of a sanity check. + + let mut seccomp_profile = LinuxSeccomp::default(); + seccomp_profile.default_action = LinuxSeccompAction::ScmpActAllow; + seccomp_profile.architectures = Some(vec![Arch::ScmpArchNative]); + let mut seccomp_syscall = LinuxSyscall::default(); + // Here, we choose an error that getcwd call would never return on its own, so + // we can make sure that getcwd failed because of seccomp rule. 
+ let expect_error = libc::EAGAIN; + seccomp_syscall.names = vec![String::from("getcwd"), String::from("setuid")]; + seccomp_syscall.action = LinuxSeccompAction::ScmpActErrno; + seccomp_syscall.errno_ret = Some(expect_error as u32); + seccomp_profile.syscalls = Some(vec![seccomp_syscall]); + + // Since Rust cargo test uses a single process to execute all tests, it + // is a good idea to fork a child process to test the seccomp profile, + // and then kill the process. This way, the main test process is + // unaffected. The child process will pass the returned error code + // to the parent for assert and checking. + let (mut sender, mut receiver) = pipe::new()?; + receiver + .set_nonblocking(false) + .with_context(|| "Failed to set channel receiver to blocking")?; + + match unsafe { nix::unistd::fork()? } { + nix::unistd::ForkResult::Parent { child } => { + nix::unistd::close(sender.as_raw_fd())?; + let mut buf = [0; 4]; + receiver + .read_exact(&mut buf) + .context("Failed to wait from child")?; + assert_eq!(i32::from_be_bytes(buf), expect_error); + wait::waitpid(child, None)?; + } + nix::unistd::ForkResult::Child => { + nix::unistd::close(receiver.as_raw_fd())?; + initialize_seccomp(Some(&seccomp_profile))?; + let ret = nix::unistd::getcwd(); + let errno: i32 = if ret.is_err() { + ret.err().unwrap() as i32 + } else { + 0 + }; + sender.write_all(&errno.to_be_bytes())?; + std::process::exit(errno); + } + } + + Ok(()) + } + + #[test] + #[serial] + fn test_moby() -> Result<()> { + let fixture_path = + path::PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("src/seccomp/fixture/config.json"); + let spec = oci_spec::runtime::Spec::load(fixture_path) + .context("Failed to load test spec for seccomp")?; + + // We know linux and seccomp exist, so let's just unwrap. + let seccomp_profile = spec.linux.unwrap().seccomp.unwrap(); + match unsafe { nix::unistd::fork()? 
} { + nix::unistd::ForkResult::Parent { child } => { + let status = wait::waitpid(child, None)?; + match status { + wait::WaitStatus::Exited(_, exit_code) => { + assert_eq!( + exit_code, 0, + "Child process didn't configure seccomp profile correctly" + ); + } + _ => { + bail!("Child process failed to exit correctly: {:?}", status); + } + } + } + nix::unistd::ForkResult::Child => { + let ret = initialize_seccomp(Some(&seccomp_profile)); + let exit_code = if ret.is_ok() { 0 } else { -1 }; + std::process::exit(exit_code); + } + } + + Ok(()) + } +} From 35c31145ade8bf02bbeb29e8fbfd690248a2e5de Mon Sep 17 00:00:00 2001 From: yihuaf Date: Tue, 14 Sep 2021 08:55:01 +0200 Subject: [PATCH 02/10] enable seccomp integration test --- integration_test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration_test.sh b/integration_test.sh index 6db7ceb24..fa362bfc7 100755 --- a/integration_test.sh +++ b/integration_test.sh @@ -48,7 +48,7 @@ test_cases=( # "linux_process_apparmor_profile/linux_process_apparmor_profile.t" "linux_readonly_paths/linux_readonly_paths.t" # "linux_rootfs_propagation/linux_rootfs_propagation.t" - # "linux_seccomp/linux_seccomp.t" + "linux_seccomp/linux_seccomp.t" "linux_sysctl/linux_sysctl.t" "linux_uid_mappings/linux_uid_mappings.t" "misc_props/misc_props.t" From 04ea0c4cabc68db556e16b9a4592e36fca78fa9d Mon Sep 17 00:00:00 2001 From: yihuaf Date: Tue, 14 Sep 2021 08:57:41 +0200 Subject: [PATCH 03/10] combine two args to test the multiple args case --- src/seccomp/fixture/config.json | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/seccomp/fixture/config.json b/src/seccomp/fixture/config.json index ec676fae6..04c6764a2 100644 --- a/src/seccomp/fixture/config.json +++ b/src/seccomp/fixture/config.json @@ -561,15 +561,7 @@ "index": 0, "value": 0, "op": "SCMP_CMP_EQ" - } - ] - }, - { - "names": [ - "personality" - ], - "action": "SCMP_ACT_ALLOW", - "args": [ + }, { "index": 0, "value": 8, From 
bfbf52b9a8ee78c1dcb31a3c95898af92dc0e57f Mon Sep 17 00:00:00 2001 From: yihuaf Date: Tue, 14 Sep 2021 10:38:42 +0200 Subject: [PATCH 04/10] fix: seccomp should not automatically set no new privilege --- src/seccomp/seccomp.rs | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/seccomp/seccomp.rs b/src/seccomp/seccomp.rs index 0d4ce9df2..8b3ff2b4b 100644 --- a/src/seccomp/seccomp.rs +++ b/src/seccomp/seccomp.rs @@ -224,6 +224,21 @@ pub fn initialize_seccomp(seccomp: Option<&LinuxSeccomp>) -> Result<()> { } } + // The SCMP_FLTATR_CTL_NNP controls if the seccomp load function will set + // the new privilege bit automatically in prctl. Normally this is a good + // thing, but for us we need better control. Based on the spec, if OCI + // runtime spec doesn't set the no new privileges in Process, we should not + // set it here. If the seccomp load operation fails without enough + // privilege, so be it. To prevent this automatic behavior, we unset the + // value here. + let ret = unsafe { seccomp_attr_set(ctx.ctx, scmp_filter_attr::SCMP_FLTATR_CTL_NNP, 0) }; + if ret != 0 { + bail!( + "Failed to unset the no new privileges bit for seccomp: {}", + ret + ); + } + if let Some(syscalls) = seccomp.syscalls.as_ref() { for syscall in syscalls { let action = translate_action(&syscall.action, syscall.errno_ret); @@ -286,7 +301,10 @@ pub fn initialize_seccomp(seccomp: Option<&LinuxSeccomp>) -> Result<()> { } } - let _ = prctl::set_no_new_privileges(true); + // In order to use the SECCOMP_SET_MODE_FILTER operation, either the calling + // thread must have the CAP_SYS_ADMIN capability in its user namespace, or + // the thread must already have the no_new_privs bit set. 
+ // Ref: https://man7.org/linux/man-pages/man2/seccomp.2.html ctx.load().context("Failed to load seccomp context")?; Ok(()) @@ -349,6 +367,7 @@ mod tests { } nix::unistd::ForkResult::Child => { nix::unistd::close(receiver.as_raw_fd())?; + let _ = prctl::set_no_new_privileges(true); initialize_seccomp(Some(&seccomp_profile))?; let ret = nix::unistd::getcwd(); let errno: i32 = if ret.is_err() { @@ -390,6 +409,7 @@ mod tests { } } nix::unistd::ForkResult::Child => { + let _ = prctl::set_no_new_privileges(true); let ret = initialize_seccomp(Some(&seccomp_profile)); let exit_code = if ret.is_ok() { 0 } else { -1 }; std::process::exit(exit_code); From 2317cdff1f2238333f3af8539932670d21591c53 Mon Sep 17 00:00:00 2001 From: yihuaf Date: Tue, 14 Sep 2021 11:01:11 +0200 Subject: [PATCH 05/10] clippy happy --- src/seccomp/mod.rs | 425 ++++++++++++++++++++++++++++++++++++++++- src/seccomp/seccomp.rs | 421 ---------------------------------------- 2 files changed, 423 insertions(+), 423 deletions(-) delete mode 100644 src/seccomp/seccomp.rs diff --git a/src/seccomp/mod.rs b/src/seccomp/mod.rs index 1f85a66a3..bb1217675 100644 --- a/src/seccomp/mod.rs +++ b/src/seccomp/mod.rs @@ -1,2 +1,423 @@ -pub mod seccomp; -pub use seccomp::initialize_seccomp; +use anyhow::bail; +use anyhow::Context; +use anyhow::Result; +use oci_spec::runtime::Arch; +use oci_spec::runtime::LinuxSeccomp; +use oci_spec::runtime::LinuxSeccompAction; +use oci_spec::runtime::LinuxSeccompOperator; +use seccomp_sys::scmp_arch::*; +use seccomp_sys::scmp_compare::*; +use seccomp_sys::*; +use std::ffi::CString; + +#[derive(Debug)] +struct Compare { + // The zero-indexed index of the syscall arguement. 
+ arg: libc::c_uint, + op: Option, + datum_a: Option, + datum_b: Option, +} + +impl Compare { + pub fn new(args: u32) -> Self { + Compare { + arg: args as libc::c_uint, + op: None, + datum_a: None, + datum_b: None, + } + } + + pub fn op(mut self, op: scmp_compare) -> Self { + self.op = Some(op); + + self + } + + pub fn datum_a(mut self, datum: scmp_datum_t) -> Self { + self.datum_a = Some(datum); + + self + } + + pub fn datum_b(mut self, datum: scmp_datum_t) -> Self { + self.datum_b = Some(datum); + + self + } + + pub fn build(self) -> Result { + if self.op.is_some() && self.datum_a.is_some() { + Ok(scmp_arg_cmp { + arg: self.arg, + op: self.op.unwrap(), + datum_a: self.datum_a.unwrap(), + datum_b: self.datum_b.unwrap_or(0), + }) + } else { + bail!("op and datum_a is required: {:?}", self); + } + } +} + +#[derive(Debug)] +struct Rule { + action: u32, + syscall_nr: i32, + comparators: Vec, +} + +impl Rule { + pub fn new(action: u32, syscall_number: i32) -> Self { + Rule { + action, + syscall_nr: syscall_number, + comparators: vec![], + } + } + + pub fn add_comparator(&mut self, cmp: scmp_arg_cmp) { + self.comparators.push(cmp); + } +} + +#[derive(Debug)] +struct FilterContext { + ctx: *mut scmp_filter_ctx, +} + +impl FilterContext { + pub fn default(default_action: u32) -> Result { + let filter_ctx = unsafe { seccomp_init(default_action) }; + if filter_ctx.is_null() { + bail!("Failed to initialized seccomp profile") + } + + Ok(FilterContext { ctx: filter_ctx }) + } + + pub fn add_rule(&mut self, rule: &Rule) -> Result<()> { + let res = match rule.comparators.len() { + 0 => unsafe { seccomp_rule_add(self.ctx, rule.action, rule.syscall_nr, 0) }, + _ => unsafe { + seccomp_rule_add_array( + self.ctx, + rule.action, + rule.syscall_nr, + rule.comparators.len() as u32, + rule.comparators.as_slice().as_ptr(), + ) + }, + }; + if res != 0 { + bail!("Failed to add rule. 
Errno: {}, Rule: {:?}", res, rule); + } + + Ok(()) + } + + pub fn add_arch(&mut self, arch: u32) -> Result<()> { + let res = unsafe { seccomp_arch_add(self.ctx, arch) }; + if res != 0 && nix::Error::from_i32(res.abs()) != nix::Error::EEXIST { + // The architecture already existed in the profile, so we can + // safely ignore the error here. Otherwise, error out. + bail!("Failed to add architecture {}. Errno: {}", arch, res); + } + + Ok(()) + } + + pub fn load(&self) -> Result<()> { + let res = unsafe { seccomp_load(self.ctx) }; + if res != 0 { + bail!("Failed to load seccomp profile: {}", res); + } + + Ok(()) + } +} + +fn translate_syscall(syscall_name: String) -> Result { + let c_syscall_name = CString::new(syscall_name.as_str()) + .with_context(|| format!("Failed to convert syscall {:?} to cstring", syscall_name))?; + let res = unsafe { seccomp_syscall_resolve_name(c_syscall_name.as_ptr()) }; + if res == __NR_SCMP_ERROR { + bail!("Failed to resolve syscall from name: {:?}", syscall_name); + } + + Ok(res) +} + +fn translate_action(action: &LinuxSeccompAction, errno: Option) -> u32 { + let errno = errno.unwrap_or(libc::EPERM as u32); + match action { + LinuxSeccompAction::ScmpActKill => SCMP_ACT_KILL, + LinuxSeccompAction::ScmpActTrap => SCMP_ACT_TRAP, + LinuxSeccompAction::ScmpActErrno => SCMP_ACT_ERRNO(errno), + LinuxSeccompAction::ScmpActTrace => SCMP_ACT_TRACE(errno), + LinuxSeccompAction::ScmpActAllow => SCMP_ACT_ALLOW, + LinuxSeccompAction::ScmpActKillProcess => SCMP_ACT_KILL_PROCESS, + LinuxSeccompAction::ScmpActNotify => SCMP_ACT_NOTIFY, + LinuxSeccompAction::ScmpActLog => SCMP_ACT_LOG, + } +} + +fn translate_op(op: &LinuxSeccompOperator) -> scmp_compare { + match op { + LinuxSeccompOperator::ScmpCmpNe => SCMP_CMP_NE, + LinuxSeccompOperator::ScmpCmpLt => SCMP_CMP_LT, + LinuxSeccompOperator::ScmpCmpLe => SCMP_CMP_LE, + LinuxSeccompOperator::ScmpCmpEq => SCMP_CMP_EQ, + LinuxSeccompOperator::ScmpCmpGe => SCMP_CMP_GE, + LinuxSeccompOperator::ScmpCmpGt => 
SCMP_CMP_GT, + LinuxSeccompOperator::ScmpCmpMaskedEq => SCMP_CMP_MASKED_EQ, + } +} + +fn translate_arch(arch: &Arch) -> scmp_arch { + match arch { + Arch::ScmpArchNative => SCMP_ARCH_NATIVE, + Arch::ScmpArchX86 => SCMP_ARCH_X86, + Arch::ScmpArchX86_64 => SCMP_ARCH_X86_64, + Arch::ScmpArchX32 => SCMP_ARCH_X32, + Arch::ScmpArchArm => SCMP_ARCH_ARM, + Arch::ScmpArchAarch64 => SCMP_ARCH_AARCH64, + Arch::ScmpArchMips => SCMP_ARCH_MIPS, + Arch::ScmpArchMips64 => SCMP_ARCH_MIPS64, + Arch::ScmpArchMips64n32 => SCMP_ARCH_MIPS64N32, + Arch::ScmpArchMipsel => SCMP_ARCH_MIPSEL, + Arch::ScmpArchMipsel64 => SCMP_ARCH_MIPSEL64, + Arch::ScmpArchMipsel64n32 => SCMP_ARCH_MIPSEL64N32, + Arch::ScmpArchPpc => SCMP_ARCH_PPC, + Arch::ScmpArchPpc64 => SCMP_ARCH_PPC64, + Arch::ScmpArchPpc64le => SCMP_ARCH_PPC64LE, + Arch::ScmpArchS390 => SCMP_ARCH_S390, + Arch::ScmpArchS390x => SCMP_ARCH_S390X, + } +} + +pub fn initialize_seccomp(seccomp: Option<&LinuxSeccomp>) -> Result<()> { + if seccomp.is_none() { + return Ok(()); + } + + let seccomp = seccomp.unwrap(); + if seccomp.flags.is_some() { + // runc did not support this, so let's skip it for now. + bail!("seccomp flags are not yet supported"); + } + + // log::debug!("XXX seccomp: {:?}", seccomp); + + // TODO: fix default action error number. The spec repo doesn't have it yet. + let default_action = translate_action(&seccomp.default_action, None); + let mut ctx = FilterContext::default(default_action)?; + + if let Some(architectures) = seccomp.architectures.as_ref() { + for arch in architectures { + let arch_token = translate_arch(arch); + ctx.add_arch(arch_token as u32) + .context("Failed to add arch to seccomp")?; + } + } + + // The SCMP_FLTATR_CTL_NNP controls if the seccomp load function will set + // the new privilege bit automatically in prctl. Normally this is a good + // thing, but for us we need better control. Based on the spec, if OCI + // runtime spec doesn't set the no new privileges in Process, we should not + // set it here. 
If the seccomp load operation fails without enough + // privilege, so be it. To prevent this automatic behavior, we unset the + // value here. + let ret = unsafe { seccomp_attr_set(ctx.ctx, scmp_filter_attr::SCMP_FLTATR_CTL_NNP, 0) }; + if ret != 0 { + bail!( + "Failed to unset the no new privileges bit for seccomp: {}", + ret + ); + } + + if let Some(syscalls) = seccomp.syscalls.as_ref() { + for syscall in syscalls { + let action = translate_action(&syscall.action, syscall.errno_ret); + if action == default_action { + // When the action is the same as the default action, the rule is redundent. We can + // skip this here to avoid failing when we add the rules. + log::warn!( + "Detect a seccomp action that is the same as the default action: {:?}", + syscall + ); + continue; + } + + for name in &syscall.names { + let ret = translate_syscall(name.clone()); + if ret.is_err() { + // If we failed to resolve the syscall by name, likely the kernel + // doeesn't support this syscall. So it is safe to skip... + log::warn!( + "Failed to resolve syscall, likely kernel doesn't support this. {:?}", + name + ); + continue; + } + + let syscall_number = translate_syscall(name.clone())?; + // Not clear why but if there are multiple arg attached to one + // syscall rule, we have to add them seperatly. add_rule will + // return EINVAL. runc does the same but doesn't explain why. + match syscall.args.as_ref() { + Some(args) => { + for arg in args { + let mut rule = Rule::new(action, syscall_number); + let cmp = Compare::new(arg.index as u32) + .op(translate_op(&arg.op)) + .datum_a(arg.value) + .datum_b(arg.value_two.unwrap_or(0)) + .build() + .context("Failed to build a seccomp compare rule")?; + rule.add_comparator(cmp); + ctx.add_rule(&rule).with_context(|| { + format!( + "Failed to add seccomp rule: {:?}. 
Syscall: {:?}", + &rule, name, + ) + })?; + } + } + None => { + let rule = Rule::new(action, syscall_number); + ctx.add_rule(&rule).with_context(|| { + format!( + "Failed to add seccomp rule: {:?}. Syscall: {:?}", + &rule, name, + ) + })?; + } + } + } + } + } + + // In order to use the SECCOMP_SET_MODE_FILTER operation, either the calling + // thread must have the CAP_SYS_ADMIN capability in its user namespace, or + // the thread must already have the no_new_privs bit set. + // Ref: https://man7.org/linux/man-pages/man2/seccomp.2.html + ctx.load().context("Failed to load seccomp context")?; + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use anyhow::Result; + use mio::unix::pipe; + use nix::sys::wait; + use oci_spec::runtime::{Arch, LinuxSeccomp, LinuxSyscall}; + use serial_test::serial; + use std::io::Read; + use std::io::Write; + use std::os::unix::prelude::AsRawFd; + use std::path; + + #[test] + #[serial] + fn test_basic() -> Result<()> { + // Note: seccomp profile is really hard to write unit test for. First, + // we can't really test default error or kill action, since rust test + // actually replies on certain syscalls. Second, some of the syscall + // will not return errorno. These syscalls will just send an abort + // signal or even just segfaults. Here we choose to use `getcwd` + // syscall for testing. This is more of a sanity check. + + // Here, we choose an error that getcwd call would never return on its own, so + // we can make sure that getcwd failed because of seccomp rule. 
+ let expect_error = libc::EAGAIN; + + let seccomp_profile = LinuxSeccomp { + default_action: LinuxSeccompAction::ScmpActAllow, + architectures: Some(vec![Arch::ScmpArchNative]), + flags: None, + syscalls: Some(vec![LinuxSyscall { + names: vec![String::from("getcwd"), String::from("setuid")], + action: LinuxSeccompAction::ScmpActErrno, + errno_ret: Some(expect_error as u32), + args: None, + }]), + }; + + // Since Rust cargo test uses a single process to execute all tests, it + // is a good idea to fork a child process to test the seccomp profile, + // and then kill the process. This way, the main test process is + // unaffected. The child process will pass the returned error code + // to the parent for assert and checking. + let (mut sender, mut receiver) = pipe::new()?; + receiver + .set_nonblocking(false) + .with_context(|| "Failed to set channel receiver to blocking")?; + + match unsafe { nix::unistd::fork()? } { + nix::unistd::ForkResult::Parent { child } => { + nix::unistd::close(sender.as_raw_fd())?; + let mut buf = [0; 4]; + receiver + .read_exact(&mut buf) + .context("Failed to wait from child")?; + assert_eq!(i32::from_be_bytes(buf), expect_error); + wait::waitpid(child, None)?; + } + nix::unistd::ForkResult::Child => { + nix::unistd::close(receiver.as_raw_fd())?; + let _ = prctl::set_no_new_privileges(true); + initialize_seccomp(Some(&seccomp_profile))?; + let ret = nix::unistd::getcwd(); + let errno: i32 = if ret.is_err() { + ret.err().unwrap() as i32 + } else { + 0 + }; + sender.write_all(&errno.to_be_bytes())?; + std::process::exit(errno); + } + } + + Ok(()) + } + + #[test] + #[serial] + fn test_moby() -> Result<()> { + let fixture_path = + path::PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("src/seccomp/fixture/config.json"); + let spec = oci_spec::runtime::Spec::load(fixture_path) + .context("Failed to load test spec for seccomp")?; + + // We know linux and seccomp exist, so let's just unwrap. 
+ let seccomp_profile = spec.linux.unwrap().seccomp.unwrap(); + match unsafe { nix::unistd::fork()? } { + nix::unistd::ForkResult::Parent { child } => { + let status = wait::waitpid(child, None)?; + match status { + wait::WaitStatus::Exited(_, exit_code) => { + assert_eq!( + exit_code, 0, + "Child process didn't configure seccomp profile correctly" + ); + } + _ => { + bail!("Child process failed to exit correctly: {:?}", status); + } + } + } + nix::unistd::ForkResult::Child => { + let _ = prctl::set_no_new_privileges(true); + let ret = initialize_seccomp(Some(&seccomp_profile)); + let exit_code = if ret.is_ok() { 0 } else { -1 }; + std::process::exit(exit_code); + } + } + + Ok(()) + } +} diff --git a/src/seccomp/seccomp.rs b/src/seccomp/seccomp.rs deleted file mode 100644 index 8b3ff2b4b..000000000 --- a/src/seccomp/seccomp.rs +++ /dev/null @@ -1,421 +0,0 @@ -use anyhow::bail; -use anyhow::Context; -use anyhow::Result; -use oci_spec::runtime::Arch; -use oci_spec::runtime::LinuxSeccomp; -use oci_spec::runtime::LinuxSeccompAction; -use oci_spec::runtime::LinuxSeccompOperator; -use seccomp_sys::scmp_arch::*; -use seccomp_sys::scmp_compare::*; -use seccomp_sys::*; -use std::ffi::CString; - -#[derive(Debug)] -pub struct Compare { - // The zero-indexed index of the syscall arguement. 
- arg: libc::c_uint, - op: Option, - datum_a: Option, - datum_b: Option, -} - -impl Compare { - pub fn new(args: u32) -> Self { - Compare { - arg: args as libc::c_uint, - op: None, - datum_a: None, - datum_b: None, - } - } - - pub fn op(mut self, op: scmp_compare) -> Self { - self.op = Some(op); - - self - } - - pub fn datum_a(mut self, datum: scmp_datum_t) -> Self { - self.datum_a = Some(datum); - - self - } - - pub fn datum_b(mut self, datum: scmp_datum_t) -> Self { - self.datum_b = Some(datum); - - self - } - - pub fn build(self) -> Result { - if self.op.is_some() && self.datum_a.is_some() { - Ok(scmp_arg_cmp { - arg: self.arg, - op: self.op.unwrap().into(), - datum_a: self.datum_a.unwrap(), - datum_b: self.datum_b.unwrap_or(0), - }) - } else { - bail!("op and datum_a is required: {:?}", self); - } - } -} - -#[derive(Debug)] -pub struct Rule { - action: u32, - syscall_nr: i32, - comparators: Vec, -} - -impl Rule { - pub fn new(action: u32, syscall_number: i32) -> Self { - Rule { - action, - syscall_nr: syscall_number, - comparators: vec![], - } - } - - pub fn add_comparator(&mut self, cmp: scmp_arg_cmp) { - self.comparators.push(cmp); - } -} - -#[derive(Debug)] -struct FilterContext { - ctx: *mut scmp_filter_ctx, -} - -impl FilterContext { - pub fn default(default_action: u32) -> Result { - let filter_ctx = unsafe { seccomp_init(default_action) }; - if filter_ctx.is_null() { - bail!("Failed to initialized seccomp profile") - } - - Ok(FilterContext { ctx: filter_ctx }) - } - - pub fn add_rule(&mut self, rule: &Rule) -> Result<()> { - let res = match rule.comparators.len() { - 0 => unsafe { seccomp_rule_add(self.ctx, rule.action, rule.syscall_nr, 0) }, - _ => unsafe { - seccomp_rule_add_array( - self.ctx, - rule.action, - rule.syscall_nr, - rule.comparators.len() as u32, - rule.comparators.as_slice().as_ptr(), - ) - }, - }; - if res != 0 { - bail!("Failed to add rule. 
Errno: {}, Rule: {:?}", res, rule); - } - - Ok(()) - } - - pub fn add_arch(&mut self, arch: u32) -> Result<()> { - let res = unsafe { seccomp_arch_add(self.ctx, arch) }; - if res != 0 { - if nix::Error::from_i32(res.abs()) != nix::Error::EEXIST { - // The architecture already existed in the profile, so we can - // safely ignore the error here. Otherwise, error out. - bail!("Failed to add architecture {}. Errno: {}", arch, res); - } - } - - Ok(()) - } - - pub fn load(&self) -> Result<()> { - let res = unsafe { seccomp_load(self.ctx) }; - if res != 0 { - bail!("Failed to load seccomp profile: {}", res); - } - - Ok(()) - } -} - -fn translate_syscall(syscall_name: String) -> Result { - let c_syscall_name = CString::new(syscall_name.as_str()) - .with_context(|| format!("Failed to convert syscall {:?} to cstring", syscall_name))?; - let res = unsafe { seccomp_syscall_resolve_name(c_syscall_name.as_ptr()) }; - if res == __NR_SCMP_ERROR { - bail!("Failed to resolve syscall from name: {:?}", syscall_name); - } - - Ok(res) -} - -fn translate_action(action: &LinuxSeccompAction, errno: Option) -> u32 { - let errno = errno.unwrap_or(libc::EPERM as u32); - match action { - LinuxSeccompAction::ScmpActKill => SCMP_ACT_KILL, - LinuxSeccompAction::ScmpActTrap => SCMP_ACT_TRAP, - LinuxSeccompAction::ScmpActErrno => SCMP_ACT_ERRNO(errno), - LinuxSeccompAction::ScmpActTrace => SCMP_ACT_TRACE(errno), - LinuxSeccompAction::ScmpActAllow => SCMP_ACT_ALLOW, - LinuxSeccompAction::ScmpActKillProcess => SCMP_ACT_KILL_PROCESS, - LinuxSeccompAction::ScmpActNotify => SCMP_ACT_NOTIFY, - LinuxSeccompAction::ScmpActLog => SCMP_ACT_LOG, - } -} - -fn translate_op(op: &LinuxSeccompOperator) -> scmp_compare { - match op { - LinuxSeccompOperator::ScmpCmpNe => SCMP_CMP_NE, - LinuxSeccompOperator::ScmpCmpLt => SCMP_CMP_LT, - LinuxSeccompOperator::ScmpCmpLe => SCMP_CMP_LE, - LinuxSeccompOperator::ScmpCmpEq => SCMP_CMP_EQ, - LinuxSeccompOperator::ScmpCmpGe => SCMP_CMP_GE, - LinuxSeccompOperator::ScmpCmpGt => 
SCMP_CMP_GT, - LinuxSeccompOperator::ScmpCmpMaskedEq => SCMP_CMP_MASKED_EQ, - } -} - -fn translate_arch(arch: &Arch) -> scmp_arch { - match arch { - Arch::ScmpArchNative => SCMP_ARCH_NATIVE, - Arch::ScmpArchX86 => SCMP_ARCH_X86, - Arch::ScmpArchX86_64 => SCMP_ARCH_X86_64, - Arch::ScmpArchX32 => SCMP_ARCH_X32, - Arch::ScmpArchArm => SCMP_ARCH_ARM, - Arch::ScmpArchAarch64 => SCMP_ARCH_AARCH64, - Arch::ScmpArchMips => SCMP_ARCH_MIPS, - Arch::ScmpArchMips64 => SCMP_ARCH_MIPS64, - Arch::ScmpArchMips64n32 => SCMP_ARCH_MIPS64N32, - Arch::ScmpArchMipsel => SCMP_ARCH_MIPSEL, - Arch::ScmpArchMipsel64 => SCMP_ARCH_MIPSEL64, - Arch::ScmpArchMipsel64n32 => SCMP_ARCH_MIPSEL64N32, - Arch::ScmpArchPpc => SCMP_ARCH_PPC, - Arch::ScmpArchPpc64 => SCMP_ARCH_PPC64, - Arch::ScmpArchPpc64le => SCMP_ARCH_PPC64LE, - Arch::ScmpArchS390 => SCMP_ARCH_S390, - Arch::ScmpArchS390x => SCMP_ARCH_S390X, - } -} - -pub fn initialize_seccomp(seccomp: Option<&LinuxSeccomp>) -> Result<()> { - if seccomp.is_none() { - return Ok(()); - } - - let seccomp = seccomp.unwrap(); - if seccomp.flags.is_some() { - // runc did not support this, so let's skip it for now. - bail!("seccomp flags are not yet supported"); - } - - // log::debug!("XXX seccomp: {:?}", seccomp); - - // TODO: fix default action error number. The spec repo doesn't have it yet. - let default_action = translate_action(&seccomp.default_action, None); - let mut ctx = FilterContext::default(default_action)?; - - if let Some(architectures) = seccomp.architectures.as_ref() { - for arch in architectures { - let arch_token = translate_arch(arch); - ctx.add_arch(arch_token as u32) - .context("Failed to add arch to seccomp")?; - } - } - - // The SCMP_FLTATR_CTL_NNP controls if the seccomp load function will set - // the new privilege bit automatically in prctl. Normally this is a good - // thing, but for us we need better control. Based on the spec, if OCI - // runtime spec doesn't set the no new privileges in Process, we should not - // set it here. 
If the seccomp load operation fails without enough - // privilege, so be it. To prevent this automatic behavior, we unset the - // value here. - let ret = unsafe { seccomp_attr_set(ctx.ctx, scmp_filter_attr::SCMP_FLTATR_CTL_NNP, 0) }; - if ret != 0 { - bail!( - "Failed to unset the no new privileges bit for seccomp: {}", - ret - ); - } - - if let Some(syscalls) = seccomp.syscalls.as_ref() { - for syscall in syscalls { - let action = translate_action(&syscall.action, syscall.errno_ret); - if action == default_action { - // When the action is the same as the default action, the rule is redundent. We can - // skip this here to avoid failing when we add the rules. - log::warn!( - "Detect a seccomp action that is the same as the default action: {:?}", - syscall - ); - continue; - } - - for name in &syscall.names { - let ret = translate_syscall(name.clone()); - if ret.is_err() { - // If we failed to resolve the syscall by name, likely the kernel - // doeesn't support this syscall. So it is safe to skip... - log::warn!( - "Failed to resolve syscall, likely kernel doesn't support this. {:?}", - name - ); - continue; - } - - let syscall_number = translate_syscall(name.clone())?; - // Not clear why but if there are multiple arg attached to one - // syscall rule, we have to add them seperatly. add_rule will - // return EINVAL. runc does the same but doesn't explain why. - match syscall.args.as_ref() { - Some(args) => { - for arg in args { - let mut rule = Rule::new(action, syscall_number); - let cmp = Compare::new(arg.index as u32) - .op(translate_op(&arg.op)) - .datum_a(arg.value) - .datum_b(arg.value_two.unwrap_or(0)) - .build() - .context("Failed to build a seccomp compare rule")?; - rule.add_comparator(cmp); - ctx.add_rule(&rule).with_context(|| { - format!( - "Failed to add seccomp rule: {:?}. 
Syscall: {:?}", - &rule, name, - ) - })?; - } - } - None => { - let rule = Rule::new(action, syscall_number); - ctx.add_rule(&rule).with_context(|| { - format!( - "Failed to add seccomp rule: {:?}. Syscall: {:?}", - &rule, name, - ) - })?; - } - } - } - } - } - - // In order to use the SECCOMP_SET_MODE_FILTER operation, either the calling - // thread must have the CAP_SYS_ADMIN capability in its user namespace, or - // the thread must already have the no_new_privs bit set. - // Ref: https://man7.org/linux/man-pages/man2/seccomp.2.html - ctx.load().context("Failed to load seccomp context")?; - - Ok(()) -} - -#[cfg(test)] -mod tests { - use super::*; - use anyhow::Result; - use mio::unix::pipe; - use nix::sys::wait; - use oci_spec::runtime::{Arch, LinuxSeccomp, LinuxSyscall}; - use serial_test::serial; - use std::io::Read; - use std::io::Write; - use std::os::unix::prelude::AsRawFd; - use std::path; - - #[test] - #[serial] - fn test_basic() -> Result<()> { - // Note: seccomp profile is really hard to write unit test for. First, - // we can't really test default error or kill action, since rust test - // actually replies on certain syscalls. Second, some of the syscall - // will not return errorno. These syscalls will just send an abort - // signal or even just segfaults. Here we choose to use `getcwd` - // syscall for testing. This is more of a sanity check. - - let mut seccomp_profile = LinuxSeccomp::default(); - seccomp_profile.default_action = LinuxSeccompAction::ScmpActAllow; - seccomp_profile.architectures = Some(vec![Arch::ScmpArchNative]); - let mut seccomp_syscall = LinuxSyscall::default(); - // Here, we choose an error that getcwd call would never return on its own, so - // we can make sure that getcwd failed because of seccomp rule. 
- let expect_error = libc::EAGAIN; - seccomp_syscall.names = vec![String::from("getcwd"), String::from("setuid")]; - seccomp_syscall.action = LinuxSeccompAction::ScmpActErrno; - seccomp_syscall.errno_ret = Some(expect_error as u32); - seccomp_profile.syscalls = Some(vec![seccomp_syscall]); - - // Since Rust cargo test uses a single process to execute all tests, it - // is a good idea to fork a child process to test the seccomp profile, - // and then kill the process. This way, the main test process is - // unaffected. The child process will pass the returned error code - // to the parent for assert and checking. - let (mut sender, mut receiver) = pipe::new()?; - receiver - .set_nonblocking(false) - .with_context(|| "Failed to set channel receiver to blocking")?; - - match unsafe { nix::unistd::fork()? } { - nix::unistd::ForkResult::Parent { child } => { - nix::unistd::close(sender.as_raw_fd())?; - let mut buf = [0; 4]; - receiver - .read_exact(&mut buf) - .context("Failed to wait from child")?; - assert_eq!(i32::from_be_bytes(buf), expect_error); - wait::waitpid(child, None)?; - } - nix::unistd::ForkResult::Child => { - nix::unistd::close(receiver.as_raw_fd())?; - let _ = prctl::set_no_new_privileges(true); - initialize_seccomp(Some(&seccomp_profile))?; - let ret = nix::unistd::getcwd(); - let errno: i32 = if ret.is_err() { - ret.err().unwrap() as i32 - } else { - 0 - }; - sender.write_all(&errno.to_be_bytes())?; - std::process::exit(errno); - } - } - - Ok(()) - } - - #[test] - #[serial] - fn test_moby() -> Result<()> { - let fixture_path = - path::PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("src/seccomp/fixture/config.json"); - let spec = oci_spec::runtime::Spec::load(fixture_path) - .context("Failed to load test spec for seccomp")?; - - // We know linux and seccomp exist, so let's just unwrap. - let seccomp_profile = spec.linux.unwrap().seccomp.unwrap(); - match unsafe { nix::unistd::fork()? 
} { - nix::unistd::ForkResult::Parent { child } => { - let status = wait::waitpid(child, None)?; - match status { - wait::WaitStatus::Exited(_, exit_code) => { - assert_eq!( - exit_code, 0, - "Child process didn't configure seccomp profile correctly" - ); - } - _ => { - bail!("Child process failed to exit correctly: {:?}", status); - } - } - } - nix::unistd::ForkResult::Child => { - let _ = prctl::set_no_new_privileges(true); - let ret = initialize_seccomp(Some(&seccomp_profile)); - let exit_code = if ret.is_ok() { 0 } else { -1 }; - std::process::exit(exit_code); - } - } - - Ok(()) - } -} From 4d147e66f66800238b21b9b06b7b48e162ab3534 Mon Sep 17 00:00:00 2001 From: yihuaf Date: Tue, 14 Sep 2021 11:12:00 +0200 Subject: [PATCH 06/10] fix: seccomp should run before drop cap --- src/process/init.rs | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/process/init.rs b/src/process/init.rs index d871a1237..10f9c4877 100644 --- a/src/process/init.rs +++ b/src/process/init.rs @@ -299,6 +299,13 @@ pub fn container_init( .set_id(Uid::from_raw(proc.user.uid), Gid::from_raw(proc.user.gid)) .context("Failed to configure uid and gid")?; + // Without no new privileges, seccomp is a privileged operation. We have to + // do this before dropping capabilities. Otherwise, we should do it later, + // as close to exec as possible. + if linux.seccomp.is_some() && proc.no_new_privileges.is_none() { + seccomp::initialize_seccomp(linux.seccomp.as_ref()).context("Failed to execute seccomp")?; + } + capabilities::reset_effective(command).context("Failed to reset effective capabilities")?; if let Some(caps) = &proc.capabilities { capabilities::drop_privileges(caps, command).context("Failed to drop capabilities")?; @@ -377,9 +384,11 @@ pub fn container_init( } } - // Initialize seccomp profile right before we are ready to execute the - // payload. The notify socket will still need network related syscalls. 
- seccomp::initialize_seccomp(linux.seccomp.as_ref()).context("Failed to execute seccomp")?; + if linux.seccomp.is_some() && proc.no_new_privileges.is_some() { + // Initialize seccomp profile right before we are ready to execute the + // payload. The notify socket will still need network related syscalls. + seccomp::initialize_seccomp(linux.seccomp.as_ref()).context("Failed to execute seccomp")?; + } if let Some(args) = proc.args.as_ref() { utils::do_exec(&args[0], args)?; From f8925313cf50b6d3fc51ed782f680b26143332b0 Mon Sep 17 00:00:00 2001 From: yihuaf Date: Wed, 15 Sep 2021 20:21:35 +0200 Subject: [PATCH 07/10] address review --- src/process/init.rs | 6 ++++-- src/seccomp/mod.rs | 27 ++++++++++----------------- 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/src/process/init.rs b/src/process/init.rs index 10f9c4877..bacd27990 100644 --- a/src/process/init.rs +++ b/src/process/init.rs @@ -303,7 +303,8 @@ pub fn container_init( // do this before dropping capabilities. Otherwise, we should do it later, // as close to exec as possible. if linux.seccomp.is_some() && proc.no_new_privileges.is_none() { - seccomp::initialize_seccomp(linux.seccomp.as_ref()).context("Failed to execute seccomp")?; + seccomp::initialize_seccomp(linux.seccomp.as_ref().unwrap()) + .context("Failed to execute seccomp")?; } capabilities::reset_effective(command).context("Failed to reset effective capabilities")?; @@ -387,7 +388,8 @@ pub fn container_init( if linux.seccomp.is_some() && proc.no_new_privileges.is_some() { // Initialize seccomp profile right before we are ready to execute the // payload. The notify socket will still need network related syscalls. 
- seccomp::initialize_seccomp(linux.seccomp.as_ref()).context("Failed to execute seccomp")?; + seccomp::initialize_seccomp(linux.seccomp.as_ref().unwrap()) + .context("Failed to execute seccomp")?; } if let Some(args) = proc.args.as_ref() { diff --git a/src/seccomp/mod.rs b/src/seccomp/mod.rs index bb1217675..bcbfee2b7 100644 --- a/src/seccomp/mod.rs +++ b/src/seccomp/mod.rs @@ -48,11 +48,11 @@ impl Compare { } pub fn build(self) -> Result { - if self.op.is_some() && self.datum_a.is_some() { + if let (Some(op), Some(datum_a)) = (self.op, self.datum_a) { Ok(scmp_arg_cmp { arg: self.arg, - op: self.op.unwrap(), - datum_a: self.datum_a.unwrap(), + op, + datum_a, datum_b: self.datum_b.unwrap_or(0), }) } else { @@ -138,8 +138,8 @@ impl FilterContext { } } -fn translate_syscall(syscall_name: String) -> Result { - let c_syscall_name = CString::new(syscall_name.as_str()) +fn translate_syscall(syscall_name: &str) -> Result { + let c_syscall_name = CString::new(syscall_name) .with_context(|| format!("Failed to convert syscall {:?} to cstring", syscall_name))?; let res = unsafe { seccomp_syscall_resolve_name(c_syscall_name.as_ptr()) }; if res == __NR_SCMP_ERROR { @@ -197,19 +197,12 @@ fn translate_arch(arch: &Arch) -> scmp_arch { } } -pub fn initialize_seccomp(seccomp: Option<&LinuxSeccomp>) -> Result<()> { - if seccomp.is_none() { - return Ok(()); - } - - let seccomp = seccomp.unwrap(); +pub fn initialize_seccomp(seccomp: &LinuxSeccomp) -> Result<()> { if seccomp.flags.is_some() { // runc did not support this, so let's skip it for now. bail!("seccomp flags are not yet supported"); } - // log::debug!("XXX seccomp: {:?}", seccomp); - // TODO: fix default action error number. The spec repo doesn't have it yet. 
let default_action = translate_action(&seccomp.default_action, None); let mut ctx = FilterContext::default(default_action)?; @@ -251,7 +244,7 @@ pub fn initialize_seccomp(seccomp: Option<&LinuxSeccomp>) -> Result<()> { } for name in &syscall.names { - let ret = translate_syscall(name.clone()); + let ret = translate_syscall(name); if ret.is_err() { // If we failed to resolve the syscall by name, likely the kernel // doeesn't support this syscall. So it is safe to skip... @@ -262,7 +255,7 @@ pub fn initialize_seccomp(seccomp: Option<&LinuxSeccomp>) -> Result<()> { continue; } - let syscall_number = translate_syscall(name.clone())?; + let syscall_number = translate_syscall(name)?; // Not clear why but if there are multiple arg attached to one // syscall rule, we have to add them seperatly. add_rule will // return EINVAL. runc does the same but doesn't explain why. @@ -370,7 +363,7 @@ mod tests { nix::unistd::ForkResult::Child => { nix::unistd::close(receiver.as_raw_fd())?; let _ = prctl::set_no_new_privileges(true); - initialize_seccomp(Some(&seccomp_profile))?; + initialize_seccomp(&seccomp_profile)?; let ret = nix::unistd::getcwd(); let errno: i32 = if ret.is_err() { ret.err().unwrap() as i32 @@ -412,7 +405,7 @@ mod tests { } nix::unistd::ForkResult::Child => { let _ = prctl::set_no_new_privileges(true); - let ret = initialize_seccomp(Some(&seccomp_profile)); + let ret = initialize_seccomp(&seccomp_profile); let exit_code = if ret.is_ok() { 0 } else { -1 }; std::process::exit(exit_code); } From fc2115855533cb06929772528012a8ed236c3e59 Mon Sep 17 00:00:00 2001 From: yihuaf Date: Wed, 15 Sep 2021 23:50:47 +0200 Subject: [PATCH 08/10] address review --- src/seccomp/mod.rs | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/src/seccomp/mod.rs b/src/seccomp/mod.rs index bcbfee2b7..509174435 100644 --- a/src/seccomp/mod.rs +++ b/src/seccomp/mod.rs @@ -53,6 +53,10 @@ impl Compare { arg: self.arg, op, datum_a, + // 
datum_b is optional for a number of ops, since these ops only + // require one value. For example, the SCMP_CMP_EQ or equal op + // requires only one value. We set the datum_b to 0 in the case + // that only one value is required. datum_b: self.datum_b.unwrap_or(0), }) } else { @@ -244,18 +248,18 @@ pub fn initialize_seccomp(seccomp: &LinuxSeccomp) -> Result<()> { } for name in &syscall.names { - let ret = translate_syscall(name); - if ret.is_err() { - // If we failed to resolve the syscall by name, likely the kernel - // doeesn't support this syscall. So it is safe to skip... - log::warn!( - "Failed to resolve syscall, likely kernel doesn't support this. {:?}", - name - ); - continue; - } - - let syscall_number = translate_syscall(name)?; + let syscall_number = match translate_syscall(name) { + Ok(x) => x, + Err(_) => { + // If we failed to resolve the syscall by name, likely the kernel + // doesn't support this syscall. So it is safe to skip... + log::warn!( + "Failed to resolve syscall, likely kernel doesn't support this. {:?}", + name + ); + continue; + } + }; // Not clear why but if there are multiple arg attached to one // syscall rule, we have to add them seperatly. add_rule will // return EINVAL. runc does the same but doesn't explain why. @@ -319,10 +323,11 @@ mod tests { fn test_basic() -> Result<()> { // Note: seccomp profile is really hard to write unit test for. First, // we can't really test default error or kill action, since rust test - // actually replies on certain syscalls. Second, some of the syscall - // will not return errorno. These syscalls will just send an abort - // signal or even just segfaults. Here we choose to use `getcwd` - // syscall for testing. This is more of a sanity check. + // actually relies on certain syscalls. Second, some of the syscalls will + // not return errno. These syscalls will just send an abort signal or + // even just segfault.
Here we choose to use `getcwd` syscall for + // testing, since it will correctly return an error under seccomp rule. + // This is more of a sanity check. // Here, we choose an error that getcwd call would never return on its own, so // we can make sure that getcwd failed because of seccomp rule. @@ -333,7 +338,7 @@ mod tests { architectures: Some(vec![Arch::ScmpArchNative]), flags: None, syscalls: Some(vec![LinuxSyscall { - names: vec![String::from("getcwd"), String::from("setuid")], + names: vec![String::from("getcwd")], action: LinuxSeccompAction::ScmpActErrno, errno_ret: Some(expect_error as u32), args: None, From 57d436290e15732d01e43ebd2e5e147f4944bbe9 Mon Sep 17 00:00:00 2001 From: yihuaf Date: Thu, 16 Sep 2021 07:37:55 +0200 Subject: [PATCH 09/10] Adds a seccomp binding --- Cargo.lock | 8 +- Cargo.toml | 3 +- seccomp/Cargo.toml | 9 + seccomp/README.md | 11 + seccomp/src/lib.rs | 664 +++++++++++++++++++++++++++++++++++++++++++++ src/seccomp/mod.rs | 8 +- 6 files changed, 693 insertions(+), 10 deletions(-) create mode 100644 seccomp/Cargo.toml create mode 100644 seccomp/README.md create mode 100644 seccomp/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 3b6952336..b37742cbb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -891,12 +891,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" [[package]] -name = "seccomp-sys" -version = "0.1.3" -source = "git+https://github.com/polachok/seccomp-sys.git?rev=9d89b10f9faa19e8f4e952663697ec126f2e2121#9d89b10f9faa19e8f4e952663697ec126f2e2121" +name = "seccomp" +version = "0.1.0" dependencies = [ "libc", - "pkg-config", ] [[package]] @@ -1160,7 +1158,7 @@ dependencies = [ "prctl", "procfs", "quickcheck", - "seccomp-sys", + "seccomp", "serde", "serde_json", "serial_test", diff --git a/Cargo.toml b/Cargo.toml index a92a4c7a9..7ab21fe67 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,6 +8,7 @@ description = "A container runtime 
written in Rust" [workspace] members = [ "cgroups", + "seccomp", ] [features] @@ -41,7 +42,7 @@ dbus = "0.9.2" tabwriter = "1" fastrand = "1.4.1" crossbeam-channel = "0.5" -seccomp-sys = { git = "https://github.com/polachok/seccomp-sys.git", rev = "9d89b10f9faa19e8f4e952663697ec126f2e2121"} +seccomp = { version = "0.1.0", path = "./seccomp" } [dev-dependencies] oci-spec = { git = "https://github.com/utam0k/oci-spec-rs/", tag = "v0.4.0-with-bugfix", features = ["proptests"] } diff --git a/seccomp/Cargo.toml b/seccomp/Cargo.toml new file mode 100644 index 000000000..097d5e332 --- /dev/null +++ b/seccomp/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "seccomp" +version = "0.1.0" +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +libc = "0.2.84" \ No newline at end of file diff --git a/seccomp/README.md b/seccomp/README.md new file mode 100644 index 000000000..31f7ae3e0 --- /dev/null +++ b/seccomp/README.md @@ -0,0 +1,11 @@ +# Bindings to libseccomp + +This crate contains a Rust FFI binding to +[libseccomp](https://github.com/seccomp/libseccomp). + +The code is adapted from auto-generated code using +[rust-bindgen](https://github.com/rust-lang/rust-bindgen). `rust-bindgen` +has trouble detecting function-like macros, which `libseccomp` uses. We +decided to manually fix the issue and include the bindings in this crate.
+ +The header file used: diff --git a/seccomp/src/lib.rs b/seccomp/src/lib.rs new file mode 100644 index 000000000..d54843db0 --- /dev/null +++ b/seccomp/src/lib.rs @@ -0,0 +1,664 @@ +extern crate libc; + +#[allow(non_camel_case_types)] +pub type __s8 = ::std::os::raw::c_schar; +#[allow(non_camel_case_types)] +pub type __u8 = ::std::os::raw::c_uchar; +#[allow(non_camel_case_types)] +pub type __s16 = ::std::os::raw::c_short; +#[allow(non_camel_case_types)] +pub type __u16 = ::std::os::raw::c_ushort; +#[allow(non_camel_case_types)] +pub type __s32 = ::std::os::raw::c_int; +#[allow(non_camel_case_types)] +pub type __u32 = ::std::os::raw::c_uint; +#[allow(non_camel_case_types)] +pub type __s64 = ::std::os::raw::c_longlong; +#[allow(non_camel_case_types)] +pub type __u64 = ::std::os::raw::c_ulonglong; + +pub const SCMP_VER_MAJOR: u32 = 2; +pub const SCMP_VER_MINOR: u32 = 5; +pub const SCMP_VER_MICRO: u32 = 1; + +pub const __NR_SCMP_ERROR: i32 = -1; +pub const __NR_SCMP_UNDEF: i32 = -2; + +#[allow(non_camel_case_types)] +#[derive(Debug, Clone, Copy)] +#[repr(C)] +pub enum scmp_arch { + SCMP_ARCH_NATIVE = 0, + SCMP_ARCH_X86 = 1073741827, + SCMP_ARCH_X86_64 = 3221225534, + SCMP_ARCH_X32 = 1073741886, + SCMP_ARCH_ARM = 1073741864, + SCMP_ARCH_AARCH64 = 3221225655, + SCMP_ARCH_MIPS = 8, + SCMP_ARCH_MIPS64 = 2147483656, + SCMP_ARCH_MIPS64N32 = 2684354568, + SCMP_ARCH_MIPSEL = 1073741832, + SCMP_ARCH_MIPSEL64 = 3221225480, + SCMP_ARCH_MIPSEL64N32 = 3758096392, + SCMP_ARCH_PPC = 20, + SCMP_ARCH_PPC64 = 2147483669, + SCMP_ARCH_PPC64LE = 3221225493, + SCMP_ARCH_S390 = 22, + SCMP_ARCH_S390X = 2147483670, + SCMP_ARCH_PARISC = 15, + SCMP_ARCH_PARISC64 = 2147483663, + SCMP_ARCH_RISCV64 = 3221225715, +} + +pub const SCMP_ACT_KILL_PROCESS: u32 = 2147483648; +pub const SCMP_ACT_KILL_THREAD: u32 = 0; +pub const SCMP_ACT_KILL: u32 = 0; +pub const SCMP_ACT_TRAP: u32 = 196608; +pub const SCMP_ACT_NOTIFY: u32 = 2143289344; +pub const SCMP_ACT_LOG: u32 = 2147221504; +pub const SCMP_ACT_ALLOW: 
u32 = 2147418112; +#[allow(non_snake_case)] +pub fn SCMP_ACT_ERRNO(x: u32) -> u32 { + 0x00050000 | ((x) & 0x0000ffff) +} +#[allow(non_snake_case)] +pub fn SCMP_ACT_TRACE(x: u32) -> u32 { + 0x7ff00000 | ((x) & 0x0000ffff) +} + +#[allow(non_camel_case_types)] +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub enum scmp_filter_attr { + _SCMP_FLTATR_MIN, + SCMP_FLTATR_ACT_DEFAULT, + SCMP_FLTATR_ACT_BADARCH, + SCMP_FLTATR_CTL_NNP, + SCMP_FLTATR_CTL_TSYNC, + SCMP_FLTATR_API_TSKIP, + SCMP_FLTATR_CTL_LOG, + SCMP_FLTATR_CTL_SSB, + SCMP_FLTATR_CTL_OPTIMIZE, + SCMP_FLTATR_API_SYSRAWRC, + _SCMP_FLTATR_MAX, +} + +#[allow(non_camel_case_types)] +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub enum scmp_compare { + _SCMP_CMP_MIN = 0, + SCMP_CMP_NE = 1, + SCMP_CMP_LT = 2, + SCMP_CMP_LE = 3, + SCMP_CMP_EQ = 4, + SCMP_CMP_GE = 5, + SCMP_CMP_GT = 6, + SCMP_CMP_MASKED_EQ = 7, + _SCMP_CMP_MAX = 8, +} + +#[allow(non_camel_case_types)] +pub type scmp_datum_t = u64; + +#[allow(non_camel_case_types)] +pub type scmp_filter_ctx = *mut ::std::os::raw::c_void; + +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct scmp_version { + pub major: ::std::os::raw::c_uint, + pub minor: ::std::os::raw::c_uint, + pub micro: ::std::os::raw::c_uint, +} + +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct scmp_arg_cmp { + pub arg: ::std::os::raw::c_uint, + pub op: scmp_compare, + pub datum_a: scmp_datum_t, + pub datum_b: scmp_datum_t, +} + +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct seccomp_data { + pub nr: ::std::os::raw::c_int, + pub arch: __u32, + pub instruction_pointer: __u64, + pub args: [__u64; 6usize], +} + +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct seccomp_notif_sizes { + pub seccomp_notif: __u16, + pub seccomp_notif_resp: __u16, + pub seccomp_data: __u16, +} + +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct seccomp_notif { + pub id: __u64, + pub pid: __u32, + pub flags: __u32, + pub data: seccomp_data, +} + +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct 
seccomp_notif_resp { + pub id: __u64, + pub val: __s64, + pub error: __s32, + pub flags: __u32, +} + +#[link(name = "seccomp")] +extern "C" { + /** + * Query the library version information + * + * This function returns a pointer to a populated scmp_version struct, the + * caller does not need to free the structure when finished. + * + */ + pub fn seccomp_version() -> *const scmp_version; + + /** + * Query the library's level of API support + * + * This function returns an API level value indicating the current supported + * functionality. It is important to note that this level of support is + * determined at runtime and therefore can change based on the running kernel + * and system configuration (e.g. any previously loaded seccomp filters). This + * function can be called multiple times, but it only queries the system the + * first time it is called, the API level is cached and used in subsequent + * calls. + * + * The current API levels are described below: + * 0 : reserved + * 1 : base level + * 2 : support for the SCMP_FLTATR_CTL_TSYNC filter attribute + * uses the seccomp(2) syscall instead of the prctl(2) syscall + * 3 : support for the SCMP_FLTATR_CTL_LOG filter attribute + * support for the SCMP_ACT_LOG action + * support for the SCMP_ACT_KILL_PROCESS action + * 4 : support for the SCMP_FLTATR_CTL_SSB filter attribute + * 5 : support for the SCMP_ACT_NOTIFY action and notify APIs + * 6 : support the simultaneous use of SCMP_FLTATR_CTL_TSYNC and notify APIs + * + */ + pub fn seccomp_api_get() -> ::std::os::raw::c_uint; + + /** + * Set the library's level of API support + * + * This function forcibly sets the API level of the library at runtime. Valid + * API levels are discussed in the description of the seccomp_api_get() + * function. General use of this function is strongly discouraged.
+ * + */ + pub fn seccomp_api_set(level: ::std::os::raw::c_uint) -> ::std::os::raw::c_int; + + /** + * Initialize the filter state + * @param def_action the default filter action + * + * This function initializes the internal seccomp filter state and should + * be called before any other functions in this library to ensure the filter + * state is initialized. Returns a filter context on success, NULL on failure. + * + */ + pub fn seccomp_init(def_action: u32) -> scmp_filter_ctx; + + /** + * Reset the filter state + * @param ctx the filter context + * @param def_action the default filter action + * + * This function resets the given seccomp filter state and ensures the + * filter state is reinitialized. This function does not reset any seccomp + * filters already loaded into the kernel. Returns zero on success, negative + * values on failure. + * + */ + pub fn seccomp_reset(ctx: scmp_filter_ctx, def_action: u32) -> ::std::os::raw::c_int; + + /** + * Destroys the filter state and releases any resources + * @param ctx the filter context + * + * This functions destroys the given seccomp filter state and releases any + * resources, including memory, associated with the filter state. This + * function does not reset any seccomp filters already loaded into the kernel. + * The filter context can no longer be used after calling this function. + * + */ + pub fn seccomp_release(ctx: scmp_filter_ctx); + + /** + * Merge two filters + * @param ctx_dst the destination filter context + * @param ctx_src the source filter context + * + * This function merges two filter contexts into a single filter context and + * destroys the second filter context. The two filter contexts must have the + * same attribute values and not contain any of the same architectures; if they + * do, the merge operation will fail. On success, the source filter context + * will be destroyed and should no longer be used; it is not necessary to + * call seccomp_release() on the source filter context. 
Returns zero on + * success, negative values on failure. + * + */ + pub fn seccomp_merge( + ctx_dst: scmp_filter_ctx, + ctx_src: scmp_filter_ctx, + ) -> ::std::os::raw::c_int; + + /** + * Resolve the architecture name to a architecture token + * @param arch_name the architecture name + * + * This function resolves the given architecture name to a token suitable for + * use with libseccomp, returns zero on failure. + * + */ + pub fn seccomp_arch_resolve_name(arch_name: *const ::std::os::raw::c_char) -> u32; + + /** + * Return the native architecture token + * + * This function returns the native architecture token value, e.g. SCMP_ARCH_*. + * + */ + pub fn seccomp_arch_native() -> u32; + + /** + * Check to see if an existing architecture is present in the filter + * @param ctx the filter context + * @param arch_token the architecture token, e.g. SCMP_ARCH_* + * + * This function tests to see if a given architecture is included in the filter + * context. If the architecture token is SCMP_ARCH_NATIVE then the native + * architecture will be assumed. Returns zero if the architecture exists in + * the filter, -EEXIST if it is not present, and other negative values on + * failure. + * + */ + pub fn seccomp_arch_exist(ctx: scmp_filter_ctx, arch_token: u32) -> ::std::os::raw::c_int; + + /** + * Adds an architecture to the filter + * @param ctx the filter context + * @param arch_token the architecture token, e.g. SCMP_ARCH_* + * + * This function adds a new architecture to the given seccomp filter context. + * Any new rules added after this function successfully returns will be added + * to this architecture but existing rules will not be added to this + * architecture. If the architecture token is SCMP_ARCH_NATIVE then the native + * architecture will be assumed. Returns zero on success, -EEXIST if + * specified architecture is already present, other negative values on failure. 
+ * + */ + pub fn seccomp_arch_add(ctx: scmp_filter_ctx, arch_token: u32) -> ::std::os::raw::c_int; + + /** + * Removes an architecture from the filter + * @param ctx the filter context + * @param arch_token the architecture token, e.g. SCMP_ARCH_* + * + * This function removes an architecture from the given seccomp filter context. + * If the architecture token is SCMP_ARCH_NATIVE then the native architecture + * will be assumed. Returns zero on success, negative values on failure. + * + */ + pub fn seccomp_arch_remove(ctx: scmp_filter_ctx, arch_token: u32) -> ::std::os::raw::c_int; + + /** + * Loads the filter into the kernel + * @param ctx the filter context + * + * This function loads the given seccomp filter context into the kernel. If + * the filter was loaded correctly, the kernel will be enforcing the filter + * when this function returns. Returns zero on success, negative values on + * error. + * + */ + pub fn seccomp_load(ctx: scmp_filter_ctx) -> ::std::os::raw::c_int; + + /** + * Get the value of a filter attribute + * @param ctx the filter context + * @param attr the filter attribute name + * @param value the filter attribute value + * + * This function fetches the value of the given attribute name and returns it + * via @value. Returns zero on success, negative values on failure. + * + */ + pub fn seccomp_attr_get( + ctx: scmp_filter_ctx, + attr: scmp_filter_attr, + value: *mut u32, + ) -> ::std::os::raw::c_int; + + /** + * Set the value of a filter attribute + * @param ctx the filter context + * @param attr the filter attribute name + * @param value the filter attribute value + * + * This function sets the value of the given attribute. Returns zero on + * success, negative values on failure. + * + */ + pub fn seccomp_attr_set( + ctx: scmp_filter_ctx, + attr: scmp_filter_attr, + value: u32, + ) -> ::std::os::raw::c_int; + + /** + * Resolve a syscall number to a name + * @param arch_token the architecture token, e.g. 
SCMP_ARCH_* + * @param num the syscall number + * + * Resolve the given syscall number to the syscall name for the given + * architecture; it is up to the caller to free the returned string. Returns + * the syscall name on success, NULL on failure. + * + */ + pub fn seccomp_syscall_resolve_num_arch( + arch_token: u32, + num: ::std::os::raw::c_int, + ) -> *mut ::std::os::raw::c_char; + + /** + * Resolve a syscall name to a number + * @param arch_token the architecture token, e.g. SCMP_ARCH_* + * @param name the syscall name + * + * Resolve the given syscall name to the syscall number for the given + * architecture. Returns the syscall number on success, including negative + * pseudo syscall numbers (e.g. __PNR_*); returns __NR_SCMP_ERROR on failure. + * + */ + pub fn seccomp_syscall_resolve_name_arch( + arch_token: u32, + name: *const ::std::os::raw::c_char, + ) -> ::std::os::raw::c_int; + + /** + * Resolve a syscall name to a number and perform any rewriting necessary + * @param arch_token the architecture token, e.g. SCMP_ARCH_* + * @param name the syscall name + * + * Resolve the given syscall name to the syscall number for the given + * architecture and do any necessary syscall rewriting needed by the + * architecture. Returns the syscall number on success, including negative + * pseudo syscall numbers (e.g. __PNR_*); returns __NR_SCMP_ERROR on failure. + * + */ + pub fn seccomp_syscall_resolve_name_rewrite( + arch_token: u32, + name: *const ::std::os::raw::c_char, + ) -> ::std::os::raw::c_int; + + /** + * Resolve a syscall name to a number + * @param name the syscall name + * + * Resolve the given syscall name to the syscall number. Returns the syscall + * number on success, including negative pseudo syscall numbers (e.g. __PNR_*); + * returns __NR_SCMP_ERROR on failure. 
+ * + */ + pub fn seccomp_syscall_resolve_name( + name: *const ::std::os::raw::c_char, + ) -> ::std::os::raw::c_int; + + /** + * Set the priority of a given syscall + * @param ctx the filter context + * @param syscall the syscall number + * @param priority priority value, higher value == higher priority + * + * This function sets the priority of the given syscall; this value is used + * when generating the seccomp filter code such that higher priority syscalls + * will incur less filter code overhead than the lower priority syscalls in the + * filter. Returns zero on success, negative values on failure. + * + */ + pub fn seccomp_syscall_priority( + ctx: scmp_filter_ctx, + syscall: ::std::os::raw::c_int, + priority: u8, + ) -> ::std::os::raw::c_int; + + /** + * Add a new rule to the filter + * @param ctx the filter context + * @param action the filter action + * @param syscall the syscall number + * @param arg_cnt the number of argument filters in the argument filter chain + * @param ... scmp_arg_cmp structs (use of SCMP_ARG_CMP() recommended) + * + * This function adds a series of new argument/value checks to the seccomp + * filter for the given syscall; multiple argument/value checks can be + * specified and they will be chained together (AND'd together) in the filter. + * If the specified rule needs to be adjusted due to architecture specifics it + * will be adjusted without notification. Returns zero on success, negative + * values on failure. + * + */ + pub fn seccomp_rule_add( + ctx: scmp_filter_ctx, + action: u32, + syscall: ::std::os::raw::c_int, + arg_cnt: ::std::os::raw::c_uint, + ... 
+ ) -> ::std::os::raw::c_int; + + /** + * Add a new rule to the filter + * @param ctx the filter context + * @param action the filter action + * @param syscall the syscall number + * @param arg_cnt the number of elements in the arg_array parameter + * @param arg_array array of scmp_arg_cmp structs + * + * This function adds a series of new argument/value checks to the seccomp + * filter for the given syscall; multiple argument/value checks can be + * specified and they will be chained together (AND'd together) in the filter. + * If the specified rule needs to be adjusted due to architecture specifics it + * will be adjusted without notification. Returns zero on success, negative + * values on failure. + * + */ + pub fn seccomp_rule_add_array( + ctx: scmp_filter_ctx, + action: u32, + syscall: ::std::os::raw::c_int, + arg_cnt: ::std::os::raw::c_uint, + arg_array: *const scmp_arg_cmp, + ) -> ::std::os::raw::c_int; + + /** + * Add a new rule to the filter + * @param ctx the filter context + * @param action the filter action + * @param syscall the syscall number + * @param arg_cnt the number of argument filters in the argument filter chain + * @param ... scmp_arg_cmp structs (use of SCMP_ARG_CMP() recommended) + * + * This function adds a series of new argument/value checks to the seccomp + * filter for the given syscall; multiple argument/value checks can be + * specified and they will be chained together (AND'd together) in the filter. + * If the specified rule can not be represented on the architecture the + * function will fail. Returns zero on success, negative values on failure. + * + */ + pub fn seccomp_rule_add_exact( + ctx: scmp_filter_ctx, + action: u32, + syscall: ::std::os::raw::c_int, + arg_cnt: ::std::os::raw::c_uint, + ... 
+ ) -> ::std::os::raw::c_int; + + /** + * Add a new rule to the filter + * @param ctx the filter context + * @param action the filter action + * @param syscall the syscall number + * @param arg_cnt the number of elements in the arg_array parameter + * @param arg_array array of scmp_arg_cmp structs + * + * This function adds a series of new argument/value checks to the seccomp + * filter for the given syscall; multiple argument/value checks can be + * specified and they will be chained together (AND'd together) in the filter. + * If the specified rule can not be represented on the architecture the + * function will fail. Returns zero on success, negative values on failure. + * + */ + pub fn seccomp_rule_add_exact_array( + ctx: scmp_filter_ctx, + action: u32, + syscall: ::std::os::raw::c_int, + arg_cnt: ::std::os::raw::c_uint, + arg_array: *const scmp_arg_cmp, + ) -> ::std::os::raw::c_int; + + /** + * Allocate a pair of notification request/response structures + * @param req the request location + * @param resp the response location + * + * This function allocates a pair of request/response structure by computing + * the correct sized based on the currently running kernel. It returns zero on + * success, and negative values on failure. + * + */ + pub fn seccomp_notify_alloc( + req: *mut *mut seccomp_notif, + resp: *mut *mut seccomp_notif_resp, + ) -> ::std::os::raw::c_int; + + /** + * Free a pair of notification request/response structures. + * @param req the request location + * @param resp the response location + */ + pub fn seccomp_notify_free(req: *mut seccomp_notif, resp: *mut seccomp_notif_resp); + + /** + * Receive a notification from a seccomp notification fd + * @param fd the notification fd + * @param req the request buffer to save into + * + * Blocks waiting for a notification on this fd. This function is thread safe + * (synchronization is performed in the kernel). Returns zero on success, + * negative values on error. 
+ * + */ + pub fn seccomp_notify_receive( + fd: ::std::os::raw::c_int, + req: *mut seccomp_notif, + ) -> ::std::os::raw::c_int; + + /** + * Send a notification response to a seccomp notification fd + * @param fd the notification fd + * @param resp the response buffer to use + * + * Sends a notification response on this fd. This function is thread safe + * (synchronization is performed in the kernel). Returns zero on success, + * negative values on error. + * + */ + pub fn seccomp_notify_respond( + fd: ::std::os::raw::c_int, + resp: *mut seccomp_notif_resp, + ) -> ::std::os::raw::c_int; + + /** + * Check if a notification id is still valid + * @param fd the notification fd + * @param id the id to test + * + * Checks to see if a notification id is still valid. Returns 0 on success, and + * negative values on failure. + * + */ + pub fn seccomp_notify_id_valid(fd: ::std::os::raw::c_int, id: u64) -> ::std::os::raw::c_int; + + /** + * Return the notification fd from a filter that has already been loaded + * @param ctx the filter context + * + * This returns the listener fd that was generated when the seccomp policy was + * loaded. This is only valid after seccomp_load() with a filter that makes + * use of SCMP_ACT_NOTIFY. + * + */ + pub fn seccomp_notify_fd(ctx: scmp_filter_ctx) -> ::std::os::raw::c_int; + + /** + * Generate seccomp Pseudo Filter Code (PFC) and export it to a file + * @param ctx the filter context + * @param fd the destination fd + * + * This function generates seccomp Pseudo Filter Code (PFC) and writes it to + * the given fd. Returns zero on success, negative values on failure. 
+ * + */ + pub fn seccomp_export_pfc( + ctx: scmp_filter_ctx, + fd: ::std::os::raw::c_int, + ) -> ::std::os::raw::c_int; + + /** + * Generate seccomp Berkeley Packet Filter (BPF) code and export it to a file + * @param ctx the filter context + * @param fd the destination fd + * + * This function generates seccomp Berkeley Packet Filter (BPF) code and writes + * it to the given fd. Returns zero on success, negative values on failure. + * + */ + pub fn seccomp_export_bpf( + ctx: scmp_filter_ctx, + fd: ::std::os::raw::c_int, + ) -> ::std::os::raw::c_int; + +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn it_works() { + // Note: we should probably run this in a different process, since it + // loads a seccomp profile. However, since this is the only test in the + // repo at the moment, this should be OK for now. + unsafe { + let ctx = seccomp_init(SCMP_ACT_ALLOW); + let cmp = scmp_arg_cmp { + arg: 0, + op: scmp_compare::SCMP_CMP_EQ, + datum_a: 1000, + datum_b: 0, + }; + + let c_syscall_name = std::ffi::CString::new("getcwd").unwrap(); + let syscall_number = seccomp_syscall_resolve_name(c_syscall_name.as_ptr()); + + assert!(seccomp_rule_add(ctx, SCMP_ACT_ERRNO(42), syscall_number, 1, cmp) == 0); + assert!(seccomp_load(ctx) == 0); + } + } +} diff --git a/src/seccomp/mod.rs b/src/seccomp/mod.rs index 509174435..285e43fb8 100644 --- a/src/seccomp/mod.rs +++ b/src/seccomp/mod.rs @@ -5,9 +5,9 @@ use oci_spec::runtime::Arch; use oci_spec::runtime::LinuxSeccomp; use oci_spec::runtime::LinuxSeccompAction; use oci_spec::runtime::LinuxSeccompOperator; -use seccomp_sys::scmp_arch::*; -use seccomp_sys::scmp_compare::*; -use seccomp_sys::*; +use seccomp::scmp_arch::*; +use seccomp::scmp_compare::*; +use seccomp::*; use std::ffi::CString; #[derive(Debug)] @@ -88,7 +88,7 @@ impl Rule { #[derive(Debug)] struct FilterContext { - ctx: *mut scmp_filter_ctx, + ctx: scmp_filter_ctx, } impl FilterContext { From c0c51b13abffca7b8a5b25763a4b58f208467514 Mon Sep 17 00:00:00 2001
From: yihuaf Date: Thu, 16 Sep 2021 07:56:05 +0200 Subject: [PATCH 10/10] make clippy happy was failing with :C-like enum variant discriminant is not portable to 32-bit targets --- seccomp/src/lib.rs | 45 +++++++++++++++++++++------------------------ src/seccomp/mod.rs | 1 - 2 files changed, 21 insertions(+), 25 deletions(-) diff --git a/seccomp/src/lib.rs b/seccomp/src/lib.rs index d54843db0..116e9a622 100644 --- a/seccomp/src/lib.rs +++ b/seccomp/src/lib.rs @@ -25,30 +25,27 @@ pub const __NR_SCMP_ERROR: i32 = -1; pub const __NR_SCMP_UNDEF: i32 = -2; #[allow(non_camel_case_types)] -#[derive(Debug, Clone, Copy)] -#[repr(C)] -pub enum scmp_arch { - SCMP_ARCH_NATIVE = 0, - SCMP_ARCH_X86 = 1073741827, - SCMP_ARCH_X86_64 = 3221225534, - SCMP_ARCH_X32 = 1073741886, - SCMP_ARCH_ARM = 1073741864, - SCMP_ARCH_AARCH64 = 3221225655, - SCMP_ARCH_MIPS = 8, - SCMP_ARCH_MIPS64 = 2147483656, - SCMP_ARCH_MIPS64N32 = 2684354568, - SCMP_ARCH_MIPSEL = 1073741832, - SCMP_ARCH_MIPSEL64 = 3221225480, - SCMP_ARCH_MIPSEL64N32 = 3758096392, - SCMP_ARCH_PPC = 20, - SCMP_ARCH_PPC64 = 2147483669, - SCMP_ARCH_PPC64LE = 3221225493, - SCMP_ARCH_S390 = 22, - SCMP_ARCH_S390X = 2147483670, - SCMP_ARCH_PARISC = 15, - SCMP_ARCH_PARISC64 = 2147483663, - SCMP_ARCH_RISCV64 = 3221225715, -} +pub type scmp_arch = u32; +pub const SCMP_ARCH_NATIVE: scmp_arch = 0; +pub const SCMP_ARCH_X86: scmp_arch = 1073741827; +pub const SCMP_ARCH_X86_64: scmp_arch = 3221225534; +pub const SCMP_ARCH_X32: scmp_arch = 1073741886; +pub const SCMP_ARCH_ARM: scmp_arch = 1073741864; +pub const SCMP_ARCH_AARCH64: scmp_arch = 3221225655; +pub const SCMP_ARCH_MIPS: scmp_arch = 8; +pub const SCMP_ARCH_MIPS64: scmp_arch = 2147483656; +pub const SCMP_ARCH_MIPS64N32: scmp_arch = 2684354568; +pub const SCMP_ARCH_MIPSEL: scmp_arch = 1073741832; +pub const SCMP_ARCH_MIPSEL64: scmp_arch = 3221225480; +pub const SCMP_ARCH_MIPSEL64N32: scmp_arch = 3758096392; +pub const SCMP_ARCH_PPC: scmp_arch = 20; +pub const SCMP_ARCH_PPC64: scmp_arch = 
2147483669; +pub const SCMP_ARCH_PPC64LE: scmp_arch = 3221225493; +pub const SCMP_ARCH_S390: scmp_arch = 22; +pub const SCMP_ARCH_S390X: scmp_arch = 2147483670; +pub const SCMP_ARCH_PARISC: scmp_arch = 15; +pub const SCMP_ARCH_PARISC64: scmp_arch = 2147483663; +pub const SCMP_ARCH_RISCV64: scmp_arch = 3221225715; pub const SCMP_ACT_KILL_PROCESS: u32 = 2147483648; pub const SCMP_ACT_KILL_THREAD: u32 = 0; diff --git a/src/seccomp/mod.rs b/src/seccomp/mod.rs index 285e43fb8..c7690534c 100644 --- a/src/seccomp/mod.rs +++ b/src/seccomp/mod.rs @@ -5,7 +5,6 @@ use oci_spec::runtime::Arch; use oci_spec::runtime::LinuxSeccomp; use oci_spec::runtime::LinuxSeccompAction; use oci_spec::runtime::LinuxSeccompOperator; -use seccomp::scmp_arch::*; use seccomp::scmp_compare::*; use seccomp::*; use std::ffi::CString;