diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 72f98fdde..38a7ea594 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -43,7 +43,7 @@ jobs: working-directory: ./cgroups - run: rustup component add rustfmt clippy - run: sudo apt-get -y update - - run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev libelf-dev + - run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev libelf-dev libseccomp-dev - name: Check formatting run: cargo fmt --all -- --check working-directory: ${{matrix.dirs}} @@ -68,7 +68,7 @@ jobs: with: working-directory: ./cgroups - run: sudo apt-get -y update - - run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev libelf-dev + - run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev libelf-dev libseccomp-dev - name: Run tests run: cargo test --all --all-features --no-fail-fast coverage: @@ -98,7 +98,7 @@ jobs: - name: Update System Libraries run: sudo apt-get -y update - name: Install System Libraries - run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev libelf-dev + run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev libelf-dev libseccomp-dev - name: Run Test Coverage for youki run: | cargo llvm-cov clean --workspace @@ -143,7 +143,7 @@ jobs: with: working-directory: ./cgroups - run: sudo apt-get -y update - - run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev libelf-dev + - run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev libelf-dev libseccomp-dev - name: Build run: ./build.sh --release - uses: actions/setup-go@v2 diff --git a/Cargo.lock b/Cargo.lock index 2903e044a..3b6952336 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -890,6 +890,15 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" +[[package]] +name = "seccomp-sys" 
+version = "0.1.3" +source = "git+https://github.com/polachok/seccomp-sys.git?rev=9d89b10f9faa19e8f4e952663697ec126f2e2121#9d89b10f9faa19e8f4e952663697ec126f2e2121" +dependencies = [ + "libc", + "pkg-config", +] + [[package]] name = "serde" version = "1.0.130" @@ -1151,6 +1160,7 @@ dependencies = [ "prctl", "procfs", "quickcheck", + "seccomp-sys", "serde", "serde_json", "serial_test", diff --git a/Cargo.toml b/Cargo.toml index 856d3db63..a92a4c7a9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,6 +41,7 @@ dbus = "0.9.2" tabwriter = "1" fastrand = "1.4.1" crossbeam-channel = "0.5" +seccomp-sys = { git = "https://github.com/polachok/seccomp-sys.git", rev = "9d89b10f9faa19e8f4e952663697ec126f2e2121"} [dev-dependencies] oci-spec = { git = "https://github.com/utam0k/oci-spec-rs/", tag = "v0.4.0-with-bugfix", features = ["proptests"] } diff --git a/README.md b/README.md index d5e42530a..52cf61971 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,8 @@ $ sudo apt-get install \ libsystemd-dev \ libdbus-glib-1-dev \ build-essential \ - libelf-dev + libelf-dev \ + libseccomp-dev ``` ### Fedora, Centos, RHEL and related distributions @@ -86,6 +87,7 @@ $ sudo dnf install \ systemd-devel \ dbus-devel \ elfutils-libelf-devel \ + libseccomp-devel ``` ## Build diff --git a/src/lib.rs b/src/lib.rs index 6a20a35fd..da320ee1e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,6 +9,7 @@ pub mod notify_socket; pub mod process; pub mod rootfs; pub mod rootless; +pub mod seccomp; pub mod signal; pub mod syscall; pub mod tty; diff --git a/src/process/init.rs b/src/process/init.rs index 03eb97e8d..d871a1237 100644 --- a/src/process/init.rs +++ b/src/process/init.rs @@ -1,3 +1,8 @@ +use super::args::ContainerArgs; +use crate::{ + capabilities, hooks, namespaces::Namespaces, process::channel, rootfs, rootless::Rootless, + seccomp, syscall::Syscall, tty, utils, +}; use anyhow::{bail, Context, Result}; use nix::mount::mount as nix_mount; use nix::mount::MsFlags; @@ -9,17 +14,12 @@ use nix::{ }; 
use oci_spec::runtime::{LinuxNamespaceType, User}; use std::collections::HashMap; -use std::{env, os::unix::io::AsRawFd}; -use std::{fs, path::Path, path::PathBuf}; - -use crate::rootless::Rootless; -use crate::{ - capabilities, hooks, namespaces::Namespaces, process::channel, rootfs, syscall::Syscall, tty, - utils, +use std::{ + env, fs, + os::unix::io::AsRawFd, + path::{Path, PathBuf}, }; -use super::args::ContainerArgs; - // Make sure a given path is on procfs. This is to avoid the security risk that // /proc path is mounted over. Ref: CVE-2019-16884 fn ensure_procfs(path: &Path) -> Result<()> { @@ -377,6 +377,10 @@ pub fn container_init( } } + // Initialize seccomp profile right before we are ready to execute the + // payload. The notify socket will still need network related syscalls. + seccomp::initialize_seccomp(linux.seccomp.as_ref()).context("Failed to execute seccomp")?; + if let Some(args) = proc.args.as_ref() { utils::do_exec(&args[0], args)?; } else { diff --git a/src/seccomp/fixture/config.json b/src/seccomp/fixture/config.json new file mode 100644 index 000000000..ec676fae6 --- /dev/null +++ b/src/seccomp/fixture/config.json @@ -0,0 +1,972 @@ +{ + "ociVersion": "1.0.1-dev", + "process": { + "terminal": false, + "user": { + "uid": 0, + "gid": 0 + }, + "args": [ + "helloworld" + ], + "env": [ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "TERM=xterm" + ], + "cwd": "/", + "capabilities": { + "bounding": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ], + "effective": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ], + "inheritable": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ], + "permitted": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ], + "ambient": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ] + }, + "rlimits": [ + { + "type": "RLIMIT_NOFILE", + "hard": 1024, + "soft": 1024 + } + ], + "noNewPrivileges": true + }, + "root": { + 
"path": "tests/assets/oci/helloworld/rootfs" + }, + "hostname": "runc", + "mounts": [ + { + "destination": "/proc", + "type": "proc", + "source": "proc" + }, + { + "destination": "/dev", + "type": "tmpfs", + "source": "tmpfs", + "options": [ + "nosuid", + "strictatime", + "mode=755", + "size=65536k" + ] + }, + { + "destination": "/dev/pts", + "type": "devpts", + "source": "devpts", + "options": [ + "nosuid", + "noexec", + "newinstance", + "ptmxmode=0666", + "mode=0620", + "gid=5" + ] + }, + { + "destination": "/dev/shm", + "type": "tmpfs", + "source": "shm", + "options": [ + "nosuid", + "noexec", + "nodev", + "mode=1777", + "size=65536k" + ] + }, + { + "destination": "/dev/mqueue", + "type": "mqueue", + "source": "mqueue", + "options": [ + "nosuid", + "noexec", + "nodev" + ] + }, + { + "destination": "/sys", + "type": "sysfs", + "source": "sysfs", + "options": [ + "nosuid", + "noexec", + "nodev", + "ro" + ] + }, + { + "destination": "/sys/fs/cgroup", + "type": "cgroup", + "source": "cgroup", + "options": [ + "nosuid", + "noexec", + "nodev", + "relatime", + "ro" + ] + } + ], + "linux": { + "devices": [ + { + "path": "/dev/kvm", + "type": "c", + "major": 10, + "minor": 232, + "fileMode": 666, + "uid": 0, + "gid": 36 + } + ], + "seccomp": { + "defaultAction": "SCMP_ACT_ERRNO", + "defaultErrnoRet": 1, + "archMap": [ + { + "architecture": "SCMP_ARCH_X86_64", + "subArchitectures": [ + "SCMP_ARCH_X86", + "SCMP_ARCH_X32" + ] + }, + { + "architecture": "SCMP_ARCH_AARCH64", + "subArchitectures": [ + "SCMP_ARCH_ARM" + ] + }, + { + "architecture": "SCMP_ARCH_MIPS64", + "subArchitectures": [ + "SCMP_ARCH_MIPS", + "SCMP_ARCH_MIPS64N32" + ] + }, + { + "architecture": "SCMP_ARCH_MIPS64N32", + "subArchitectures": [ + "SCMP_ARCH_MIPS", + "SCMP_ARCH_MIPS64" + ] + }, + { + "architecture": "SCMP_ARCH_MIPSEL64", + "subArchitectures": [ + "SCMP_ARCH_MIPSEL", + "SCMP_ARCH_MIPSEL64N32" + ] + }, + { + "architecture": "SCMP_ARCH_MIPSEL64N32", + "subArchitectures": [ + "SCMP_ARCH_MIPSEL", + 
"SCMP_ARCH_MIPSEL64" + ] + }, + { + "architecture": "SCMP_ARCH_S390X", + "subArchitectures": [ + "SCMP_ARCH_S390" + ] + } + ], + "syscalls": [ + { + "names": [ + "accept", + "accept4", + "access", + "adjtimex", + "alarm", + "bind", + "brk", + "capget", + "capset", + "chdir", + "chmod", + "chown", + "chown32", + "clock_adjtime", + "clock_adjtime64", + "clock_getres", + "clock_getres_time64", + "clock_gettime", + "clock_gettime64", + "clock_nanosleep", + "clock_nanosleep_time64", + "close", + "close_range", + "connect", + "copy_file_range", + "creat", + "dup", + "dup2", + "dup3", + "epoll_create", + "epoll_create1", + "epoll_ctl", + "epoll_ctl_old", + "epoll_pwait", + "epoll_pwait2", + "epoll_wait", + "epoll_wait_old", + "eventfd", + "eventfd2", + "execve", + "execveat", + "exit", + "exit_group", + "faccessat", + "faccessat2", + "fadvise64", + "fadvise64_64", + "fallocate", + "fanotify_mark", + "fchdir", + "fchmod", + "fchmodat", + "fchown", + "fchown32", + "fchownat", + "fcntl", + "fcntl64", + "fdatasync", + "fgetxattr", + "flistxattr", + "flock", + "fork", + "fremovexattr", + "fsetxattr", + "fstat", + "fstat64", + "fstatat64", + "fstatfs", + "fstatfs64", + "fsync", + "ftruncate", + "ftruncate64", + "futex", + "futex_time64", + "futimesat", + "getcpu", + "getcwd", + "getdents", + "getdents64", + "getegid", + "getegid32", + "geteuid", + "geteuid32", + "getgid", + "getgid32", + "getgroups", + "getgroups32", + "getitimer", + "getpeername", + "getpgid", + "getpgrp", + "getpid", + "getppid", + "getpriority", + "getrandom", + "getresgid", + "getresgid32", + "getresuid", + "getresuid32", + "getrlimit", + "get_robust_list", + "getrusage", + "getsid", + "getsockname", + "getsockopt", + "get_thread_area", + "gettid", + "gettimeofday", + "getuid", + "getuid32", + "getxattr", + "inotify_add_watch", + "inotify_init", + "inotify_init1", + "inotify_rm_watch", + "io_cancel", + "ioctl", + "io_destroy", + "io_getevents", + "io_pgetevents", + "io_pgetevents_time64", + "ioprio_get", + 
"ioprio_set", + "io_setup", + "io_submit", + "io_uring_enter", + "io_uring_register", + "io_uring_setup", + "ipc", + "kill", + "lchown", + "lchown32", + "lgetxattr", + "link", + "linkat", + "listen", + "listxattr", + "llistxattr", + "_llseek", + "lremovexattr", + "lseek", + "lsetxattr", + "lstat", + "lstat64", + "madvise", + "membarrier", + "memfd_create", + "mincore", + "mkdir", + "mkdirat", + "mknod", + "mknodat", + "mlock", + "mlock2", + "mlockall", + "mmap", + "mmap2", + "mprotect", + "mq_getsetattr", + "mq_notify", + "mq_open", + "mq_timedreceive", + "mq_timedreceive_time64", + "mq_timedsend", + "mq_timedsend_time64", + "mq_unlink", + "mremap", + "msgctl", + "msgget", + "msgrcv", + "msgsnd", + "msync", + "munlock", + "munlockall", + "munmap", + "nanosleep", + "newfstatat", + "_newselect", + "open", + "openat", + "openat2", + "pause", + "pidfd_open", + "pidfd_send_signal", + "pipe", + "pipe2", + "poll", + "ppoll", + "ppoll_time64", + "prctl", + "pread64", + "preadv", + "preadv2", + "prlimit64", + "pselect6", + "pselect6_time64", + "pwrite64", + "pwritev", + "pwritev2", + "read", + "readahead", + "readlink", + "readlinkat", + "readv", + "recv", + "recvfrom", + "recvmmsg", + "recvmmsg_time64", + "recvmsg", + "remap_file_pages", + "removexattr", + "rename", + "renameat", + "renameat2", + "restart_syscall", + "rmdir", + "rseq", + "rt_sigaction", + "rt_sigpending", + "rt_sigprocmask", + "rt_sigqueueinfo", + "rt_sigreturn", + "rt_sigsuspend", + "rt_sigtimedwait", + "rt_sigtimedwait_time64", + "rt_tgsigqueueinfo", + "sched_getaffinity", + "sched_getattr", + "sched_getparam", + "sched_get_priority_max", + "sched_get_priority_min", + "sched_getscheduler", + "sched_rr_get_interval", + "sched_rr_get_interval_time64", + "sched_setaffinity", + "sched_setattr", + "sched_setparam", + "sched_setscheduler", + "sched_yield", + "seccomp", + "select", + "semctl", + "semget", + "semop", + "semtimedop", + "semtimedop_time64", + "send", + "sendfile", + "sendfile64", + "sendmmsg", + 
"sendmsg", + "sendto", + "setfsgid", + "setfsgid32", + "setfsuid", + "setfsuid32", + "setgid", + "setgid32", + "setgroups", + "setgroups32", + "setitimer", + "setpgid", + "setpriority", + "setregid", + "setregid32", + "setresgid", + "setresgid32", + "setresuid", + "setresuid32", + "setreuid", + "setreuid32", + "setrlimit", + "set_robust_list", + "setsid", + "setsockopt", + "set_thread_area", + "set_tid_address", + "setuid", + "setuid32", + "setxattr", + "shmat", + "shmctl", + "shmdt", + "shmget", + "shutdown", + "sigaltstack", + "signalfd", + "signalfd4", + "sigprocmask", + "sigreturn", + "socket", + "socketcall", + "socketpair", + "splice", + "stat", + "stat64", + "statfs", + "statfs64", + "statx", + "symlink", + "symlinkat", + "sync", + "sync_file_range", + "syncfs", + "sysinfo", + "tee", + "tgkill", + "time", + "timer_create", + "timer_delete", + "timer_getoverrun", + "timer_gettime", + "timer_gettime64", + "timer_settime", + "timer_settime64", + "timerfd_create", + "timerfd_gettime", + "timerfd_gettime64", + "timerfd_settime", + "timerfd_settime64", + "times", + "tkill", + "truncate", + "truncate64", + "ugetrlimit", + "umask", + "uname", + "unlink", + "unlinkat", + "utime", + "utimensat", + "utimensat_time64", + "utimes", + "vfork", + "vmsplice", + "wait4", + "waitid", + "waitpid", + "write", + "writev" + ], + "action": "SCMP_ACT_ALLOW" + }, + { + "names": [ + "process_vm_readv", + "process_vm_writev", + "ptrace" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "minKernel": "4.8" + } + }, + { + "names": [ + "personality" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 0, + "value": 0, + "op": "SCMP_CMP_EQ" + } + ] + }, + { + "names": [ + "personality" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 0, + "value": 8, + "op": "SCMP_CMP_EQ" + } + ] + }, + { + "names": [ + "personality" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 0, + "value": 131072, + "op": "SCMP_CMP_EQ" + } + ] + }, + { + "names": [ + 
"personality" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 0, + "value": 131080, + "op": "SCMP_CMP_EQ" + } + ] + }, + { + "names": [ + "personality" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 0, + "value": 4294967295, + "op": "SCMP_CMP_EQ" + } + ] + }, + { + "names": [ + "sync_file_range2" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "arches": [ + "ppc64le" + ] + } + }, + { + "names": [ + "arm_fadvise64_64", + "arm_sync_file_range", + "sync_file_range2", + "breakpoint", + "cacheflush", + "set_tls" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "arches": [ + "arm", + "arm64" + ] + } + }, + { + "names": [ + "arch_prctl" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "arches": [ + "amd64", + "x32" + ] + } + }, + { + "names": [ + "modify_ldt" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "arches": [ + "amd64", + "x32", + "x86" + ] + } + }, + { + "names": [ + "s390_pci_mmio_read", + "s390_pci_mmio_write", + "s390_runtime_instr" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "arches": [ + "s390", + "s390x" + ] + } + }, + { + "names": [ + "open_by_handle_at" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_DAC_READ_SEARCH" + ] + } + }, + { + "names": [ + "bpf", + "clone", + "clone3", + "fanotify_init", + "fsconfig", + "fsmount", + "fsopen", + "fspick", + "lookup_dcookie", + "mount", + "move_mount", + "name_to_handle_at", + "open_tree", + "perf_event_open", + "quotactl", + "setdomainname", + "sethostname", + "setns", + "syslog", + "umount", + "umount2", + "unshare" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_SYS_ADMIN" + ] + } + }, + { + "names": [ + "clone" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 0, + "value": 2114060288, + "op": "SCMP_CMP_MASKED_EQ" + } + ], + "excludes": { + "caps": [ + "CAP_SYS_ADMIN" + ], + "arches": [ + "s390", + "s390x" + ] + } + }, + { + "names": [ + "clone" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + 
"index": 1, + "value": 2114060288, + "op": "SCMP_CMP_MASKED_EQ" + } + ], + "comment": "s390 parameter ordering for clone is different", + "includes": { + "arches": [ + "s390", + "s390x" + ] + }, + "excludes": { + "caps": [ + "CAP_SYS_ADMIN" + ] + } + }, + { + "names": [ + "clone3" + ], + "action": "SCMP_ACT_ERRNO", + "errnoRet": 38, + "excludes": { + "caps": [ + "CAP_SYS_ADMIN" + ] + } + }, + { + "names": [ + "reboot" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_SYS_BOOT" + ] + } + }, + { + "names": [ + "chroot" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_SYS_CHROOT" + ] + } + }, + { + "names": [ + "delete_module", + "init_module", + "finit_module" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_SYS_MODULE" + ] + } + }, + { + "names": [ + "acct" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_SYS_PACCT" + ] + } + }, + { + "names": [ + "kcmp", + "pidfd_getfd", + "process_madvise", + "process_vm_readv", + "process_vm_writev", + "ptrace" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_SYS_PTRACE" + ] + } + }, + { + "names": [ + "iopl", + "ioperm" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_SYS_RAWIO" + ] + } + }, + { + "names": [ + "settimeofday", + "stime", + "clock_settime" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_SYS_TIME" + ] + } + }, + { + "names": [ + "vhangup" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_SYS_TTY_CONFIG" + ] + } + }, + { + "names": [ + "get_mempolicy", + "mbind", + "set_mempolicy" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_SYS_NICE" + ] + } + }, + { + "names": [ + "syslog" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_SYSLOG" + ] + } + } + ] + }, + "resources": { + "devices": [ + { + "allow": true, + "access": "rwm" + } + ] + }, + "uidMappings": [ + { + "containerID": 0, + "hostID": 1000, + "size": 1 + 
} + ], + "gidMappings": [ + { + "containerID": 0, + "hostID": 1000, + "size": 1 + } + ], + "namespaces": [ + { + "type": "pid" + }, + { + "type": "network" + }, + { + "type": "user" + }, + { + "type": "ipc" + }, + { + "type": "uts" + }, + { + "type": "mount" + } + ], + "maskedPaths": [ + "/proc/kcore", + "/proc/latency_stats", + "/proc/timer_list", + "/proc/timer_stats", + "/proc/sched_debug", + "/sys/firmware", + "/proc/scsi" + ], + "readonlyPaths": [ + "/proc/asound", + "/proc/bus", + "/proc/fs", + "/proc/irq", + "/proc/sys", + "/proc/sysrq-trigger" + ] + } +} \ No newline at end of file diff --git a/src/seccomp/mod.rs b/src/seccomp/mod.rs new file mode 100644 index 000000000..1f85a66a3 --- /dev/null +++ b/src/seccomp/mod.rs @@ -0,0 +1,2 @@ +pub mod seccomp; +pub use seccomp::initialize_seccomp; diff --git a/src/seccomp/seccomp.rs b/src/seccomp/seccomp.rs new file mode 100644 index 000000000..0d4ce9df2 --- /dev/null +++ b/src/seccomp/seccomp.rs @@ -0,0 +1,401 @@ +use anyhow::bail; +use anyhow::Context; +use anyhow::Result; +use oci_spec::runtime::Arch; +use oci_spec::runtime::LinuxSeccomp; +use oci_spec::runtime::LinuxSeccompAction; +use oci_spec::runtime::LinuxSeccompOperator; +use seccomp_sys::scmp_arch::*; +use seccomp_sys::scmp_compare::*; +use seccomp_sys::*; +use std::ffi::CString; + +#[derive(Debug)] +pub struct Compare { + // The zero-indexed index of the syscall argument. 
+ arg: libc::c_uint, + op: Option<scmp_compare>, + datum_a: Option<scmp_datum_t>, + datum_b: Option<scmp_datum_t>, +} + +impl Compare { + pub fn new(args: u32) -> Self { + Compare { + arg: args as libc::c_uint, + op: None, + datum_a: None, + datum_b: None, + } + } + + pub fn op(mut self, op: scmp_compare) -> Self { + self.op = Some(op); + + self + } + + pub fn datum_a(mut self, datum: scmp_datum_t) -> Self { + self.datum_a = Some(datum); + + self + } + + pub fn datum_b(mut self, datum: scmp_datum_t) -> Self { + self.datum_b = Some(datum); + + self + } + + pub fn build(self) -> Result<scmp_arg_cmp> { + if self.op.is_some() && self.datum_a.is_some() { + Ok(scmp_arg_cmp { + arg: self.arg, + op: self.op.unwrap().into(), + datum_a: self.datum_a.unwrap(), + datum_b: self.datum_b.unwrap_or(0), + }) + } else { + bail!("op and datum_a is required: {:?}", self); + } + } +} + +#[derive(Debug)] +pub struct Rule { + action: u32, + syscall_nr: i32, + comparators: Vec<scmp_arg_cmp>, +} + +impl Rule { + pub fn new(action: u32, syscall_number: i32) -> Self { + Rule { + action, + syscall_nr: syscall_number, + comparators: vec![], + } + } + + pub fn add_comparator(&mut self, cmp: scmp_arg_cmp) { + self.comparators.push(cmp); + } +} + +#[derive(Debug)] +struct FilterContext { + ctx: *mut scmp_filter_ctx, +} + +impl FilterContext { + pub fn default(default_action: u32) -> Result<Self> { + let filter_ctx = unsafe { seccomp_init(default_action) }; + if filter_ctx.is_null() { + bail!("Failed to initialized seccomp profile") + } + + Ok(FilterContext { ctx: filter_ctx }) + } + + pub fn add_rule(&mut self, rule: &Rule) -> Result<()> { + let res = match rule.comparators.len() { + 0 => unsafe { seccomp_rule_add(self.ctx, rule.action, rule.syscall_nr, 0) }, + _ => unsafe { + seccomp_rule_add_array( + self.ctx, + rule.action, + rule.syscall_nr, + rule.comparators.len() as u32, + rule.comparators.as_slice().as_ptr(), + ) + }, + }; + if res != 0 { + bail!("Failed to add rule. 
Errno: {}, Rule: {:?}", res, rule); + } + + Ok(()) + } + + pub fn add_arch(&mut self, arch: u32) -> Result<()> { + let res = unsafe { seccomp_arch_add(self.ctx, arch) }; + if res != 0 { + if nix::Error::from_i32(res.abs()) != nix::Error::EEXIST { + // EEXIST means the architecture already exists in the profile, which + // we can safely ignore. Any other error is fatal, so error out. + bail!("Failed to add architecture {}. Errno: {}", arch, res); + } + } + + Ok(()) + } + + pub fn load(&self) -> Result<()> { + let res = unsafe { seccomp_load(self.ctx) }; + if res != 0 { + bail!("Failed to load seccomp profile: {}", res); + } + + Ok(()) + } +} + +fn translate_syscall(syscall_name: String) -> Result<i32> { + let c_syscall_name = CString::new(syscall_name.as_str()) + .with_context(|| format!("Failed to convert syscall {:?} to cstring", syscall_name))?; + let res = unsafe { seccomp_syscall_resolve_name(c_syscall_name.as_ptr()) }; + if res == __NR_SCMP_ERROR { + bail!("Failed to resolve syscall from name: {:?}", syscall_name); + } + + Ok(res) +} + +fn translate_action(action: &LinuxSeccompAction, errno: Option<u32>) -> u32 { + let errno = errno.unwrap_or(libc::EPERM as u32); + match action { + LinuxSeccompAction::ScmpActKill => SCMP_ACT_KILL, + LinuxSeccompAction::ScmpActTrap => SCMP_ACT_TRAP, + LinuxSeccompAction::ScmpActErrno => SCMP_ACT_ERRNO(errno), + LinuxSeccompAction::ScmpActTrace => SCMP_ACT_TRACE(errno), + LinuxSeccompAction::ScmpActAllow => SCMP_ACT_ALLOW, + LinuxSeccompAction::ScmpActKillProcess => SCMP_ACT_KILL_PROCESS, + LinuxSeccompAction::ScmpActNotify => SCMP_ACT_NOTIFY, + LinuxSeccompAction::ScmpActLog => SCMP_ACT_LOG, + } +} + +fn translate_op(op: &LinuxSeccompOperator) -> scmp_compare { + match op { + LinuxSeccompOperator::ScmpCmpNe => SCMP_CMP_NE, + LinuxSeccompOperator::ScmpCmpLt => SCMP_CMP_LT, + LinuxSeccompOperator::ScmpCmpLe => SCMP_CMP_LE, + LinuxSeccompOperator::ScmpCmpEq => SCMP_CMP_EQ, + LinuxSeccompOperator::ScmpCmpGe => SCMP_CMP_GE, + LinuxSeccompOperator::ScmpCmpGt => 
SCMP_CMP_GT, + LinuxSeccompOperator::ScmpCmpMaskedEq => SCMP_CMP_MASKED_EQ, + } +} + +fn translate_arch(arch: &Arch) -> scmp_arch { + match arch { + Arch::ScmpArchNative => SCMP_ARCH_NATIVE, + Arch::ScmpArchX86 => SCMP_ARCH_X86, + Arch::ScmpArchX86_64 => SCMP_ARCH_X86_64, + Arch::ScmpArchX32 => SCMP_ARCH_X32, + Arch::ScmpArchArm => SCMP_ARCH_ARM, + Arch::ScmpArchAarch64 => SCMP_ARCH_AARCH64, + Arch::ScmpArchMips => SCMP_ARCH_MIPS, + Arch::ScmpArchMips64 => SCMP_ARCH_MIPS64, + Arch::ScmpArchMips64n32 => SCMP_ARCH_MIPS64N32, + Arch::ScmpArchMipsel => SCMP_ARCH_MIPSEL, + Arch::ScmpArchMipsel64 => SCMP_ARCH_MIPSEL64, + Arch::ScmpArchMipsel64n32 => SCMP_ARCH_MIPSEL64N32, + Arch::ScmpArchPpc => SCMP_ARCH_PPC, + Arch::ScmpArchPpc64 => SCMP_ARCH_PPC64, + Arch::ScmpArchPpc64le => SCMP_ARCH_PPC64LE, + Arch::ScmpArchS390 => SCMP_ARCH_S390, + Arch::ScmpArchS390x => SCMP_ARCH_S390X, + } +} + +pub fn initialize_seccomp(seccomp: Option<&LinuxSeccomp>) -> Result<()> { + if seccomp.is_none() { + return Ok(()); + } + + let seccomp = seccomp.unwrap(); + if seccomp.flags.is_some() { + // runc did not support this, so we reject it for now. + bail!("seccomp flags are not yet supported"); + } + + // log::debug!("XXX seccomp: {:?}", seccomp); + + // TODO: fix default action error number. The spec repo doesn't have it yet. + let default_action = translate_action(&seccomp.default_action, None); + let mut ctx = FilterContext::default(default_action)?; + + if let Some(architectures) = seccomp.architectures.as_ref() { + for arch in architectures { + let arch_token = translate_arch(arch); + ctx.add_arch(arch_token as u32) + .context("Failed to add arch to seccomp")?; + } + } + + if let Some(syscalls) = seccomp.syscalls.as_ref() { + for syscall in syscalls { + let action = translate_action(&syscall.action, syscall.errno_ret); + if action == default_action { + // When the action is the same as the default action, the rule is redundant. 
We can + // skip this here to avoid failing when we add the rules. + log::warn!( + "Detect a seccomp action that is the same as the default action: {:?}", + syscall + ); + continue; + } + + for name in &syscall.names { + let ret = translate_syscall(name.clone()); + if ret.is_err() { + // If we failed to resolve the syscall by name, likely the kernel + // doesn't support this syscall. So it is safe to skip... + log::warn!( + "Failed to resolve syscall, likely kernel doesn't support this. {:?}", + name + ); + continue; + } + + let syscall_number = translate_syscall(name.clone())?; + // Not clear why, but if multiple args are attached to one + // syscall rule, we have to add them separately; otherwise add_rule + // returns EINVAL. runc does the same but doesn't explain why. + match syscall.args.as_ref() { + Some(args) => { + for arg in args { + let mut rule = Rule::new(action, syscall_number); + let cmp = Compare::new(arg.index as u32) + .op(translate_op(&arg.op)) + .datum_a(arg.value) + .datum_b(arg.value_two.unwrap_or(0)) + .build() + .context("Failed to build a seccomp compare rule")?; + rule.add_comparator(cmp); + ctx.add_rule(&rule).with_context(|| { + format!( + "Failed to add seccomp rule: {:?}. Syscall: {:?}", + &rule, name, + ) + })?; + } + } + None => { + let rule = Rule::new(action, syscall_number); + ctx.add_rule(&rule).with_context(|| { + format!( + "Failed to add seccomp rule: {:?}. 
Syscall: {:?}", + &rule, name, + ) + })?; + } + } + } + } + } + + let _ = prctl::set_no_new_privileges(true); + ctx.load().context("Failed to load seccomp context")?; + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use anyhow::Result; + use mio::unix::pipe; + use nix::sys::wait; + use oci_spec::runtime::{Arch, LinuxSeccomp, LinuxSyscall}; + use serial_test::serial; + use std::io::Read; + use std::io::Write; + use std::os::unix::prelude::AsRawFd; + use std::path; + + #[test] + #[serial] + fn test_basic() -> Result<()> { + // Note: seccomp profile is really hard to write unit test for. First, + // we can't really test default error or kill action, since rust test + // actually relies on certain syscalls. Second, some of the syscalls + // will not return errno. These syscalls will just send an abort + // signal or even just segfault. Here we choose to use `getcwd` + // syscall for testing. This is more of a sanity check. + + let mut seccomp_profile = LinuxSeccomp::default(); + seccomp_profile.default_action = LinuxSeccompAction::ScmpActAllow; + seccomp_profile.architectures = Some(vec![Arch::ScmpArchNative]); + let mut seccomp_syscall = LinuxSyscall::default(); + // Here, we choose an error that getcwd call would never return on its own, so + // we can make sure that getcwd failed because of seccomp rule. + let expect_error = libc::EAGAIN; + seccomp_syscall.names = vec![String::from("getcwd"), String::from("setuid")]; + seccomp_syscall.action = LinuxSeccompAction::ScmpActErrno; + seccomp_syscall.errno_ret = Some(expect_error as u32); + seccomp_profile.syscalls = Some(vec![seccomp_syscall]); + + // Since Rust cargo test uses a single process to execute all tests, it + // is a good idea to fork a child process to test the seccomp profile, + // and then kill the process. This way, the main test process is + // unaffected. The child process will pass the returned error code + // to the parent for assert and checking. 
+ let (mut sender, mut receiver) = pipe::new()?; + receiver + .set_nonblocking(false) + .with_context(|| "Failed to set channel receiver to blocking")?; + + match unsafe { nix::unistd::fork()? } { + nix::unistd::ForkResult::Parent { child } => { + nix::unistd::close(sender.as_raw_fd())?; + let mut buf = [0; 4]; + receiver + .read_exact(&mut buf) + .context("Failed to wait from child")?; + assert_eq!(i32::from_be_bytes(buf), expect_error); + wait::waitpid(child, None)?; + } + nix::unistd::ForkResult::Child => { + nix::unistd::close(receiver.as_raw_fd())?; + initialize_seccomp(Some(&seccomp_profile))?; + let ret = nix::unistd::getcwd(); + let errno: i32 = if ret.is_err() { + ret.err().unwrap() as i32 + } else { + 0 + }; + sender.write_all(&errno.to_be_bytes())?; + std::process::exit(errno); + } + } + + Ok(()) + } + + #[test] + #[serial] + fn test_moby() -> Result<()> { + let fixture_path = + path::PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("src/seccomp/fixture/config.json"); + let spec = oci_spec::runtime::Spec::load(fixture_path) + .context("Failed to load test spec for seccomp")?; + + // We know linux and seccomp exist, so let's just unwrap. + let seccomp_profile = spec.linux.unwrap().seccomp.unwrap(); + match unsafe { nix::unistd::fork()? } { + nix::unistd::ForkResult::Parent { child } => { + let status = wait::waitpid(child, None)?; + match status { + wait::WaitStatus::Exited(_, exit_code) => { + assert_eq!( + exit_code, 0, + "Child process didn't configure seccomp profile correctly" + ); + } + _ => { + bail!("Child process failed to exit correctly: {:?}", status); + } + } + } + nix::unistd::ForkResult::Child => { + let ret = initialize_seccomp(Some(&seccomp_profile)); + let exit_code = if ret.is_ok() { 0 } else { -1 }; + std::process::exit(exit_code); + } + } + + Ok(()) + } +}