diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 72f98fdde..38a7ea594 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -43,7 +43,7 @@ jobs: working-directory: ./cgroups - run: rustup component add rustfmt clippy - run: sudo apt-get -y update - - run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev libelf-dev + - run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev libelf-dev libseccomp-dev - name: Check formatting run: cargo fmt --all -- --check working-directory: ${{matrix.dirs}} @@ -68,7 +68,7 @@ jobs: with: working-directory: ./cgroups - run: sudo apt-get -y update - - run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev libelf-dev + - run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev libelf-dev libseccomp-dev - name: Run tests run: cargo test --all --all-features --no-fail-fast coverage: @@ -98,7 +98,7 @@ jobs: - name: Update System Libraries run: sudo apt-get -y update - name: Install System Libraries - run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev libelf-dev + run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev libelf-dev libseccomp-dev - name: Run Test Coverage for youki run: | cargo llvm-cov clean --workspace @@ -143,7 +143,7 @@ jobs: with: working-directory: ./cgroups - run: sudo apt-get -y update - - run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev libelf-dev + - run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev libelf-dev libseccomp-dev - name: Build run: ./build.sh --release - uses: actions/setup-go@v2 diff --git a/Cargo.lock b/Cargo.lock index 2903e044a..b37742cbb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -890,6 +890,13 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" +[[package]] +name = "seccomp" 
+version = "0.1.0" +dependencies = [ + "libc", +] + [[package]] name = "serde" version = "1.0.130" @@ -1151,6 +1158,7 @@ dependencies = [ "prctl", "procfs", "quickcheck", + "seccomp", "serde", "serde_json", "serial_test", diff --git a/Cargo.toml b/Cargo.toml index 856d3db63..7ab21fe67 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,6 +8,7 @@ description = "A container runtime written in Rust" [workspace] members = [ "cgroups", + "seccomp", ] [features] @@ -41,6 +42,7 @@ dbus = "0.9.2" tabwriter = "1" fastrand = "1.4.1" crossbeam-channel = "0.5" +seccomp = { version = "0.1.0", path = "./seccomp" } [dev-dependencies] oci-spec = { git = "https://github.com/utam0k/oci-spec-rs/", tag = "v0.4.0-with-bugfix", features = ["proptests"] } diff --git a/README.md b/README.md index d5e42530a..52cf61971 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,8 @@ $ sudo apt-get install \ libsystemd-dev \ libdbus-glib-1-dev \ build-essential \ - libelf-dev + libelf-dev \ + libseccomp-dev ``` ### Fedora, Centos, RHEL and related distributions @@ -86,6 +87,7 @@ $ sudo dnf install \ systemd-devel \ dbus-devel \ elfutils-libelf-devel \ + libseccomp-devel ``` ## Build diff --git a/integration_test.sh b/integration_test.sh index 6db7ceb24..fa362bfc7 100755 --- a/integration_test.sh +++ b/integration_test.sh @@ -48,7 +48,7 @@ test_cases=( # "linux_process_apparmor_profile/linux_process_apparmor_profile.t" "linux_readonly_paths/linux_readonly_paths.t" # "linux_rootfs_propagation/linux_rootfs_propagation.t" - # "linux_seccomp/linux_seccomp.t" + "linux_seccomp/linux_seccomp.t" "linux_sysctl/linux_sysctl.t" "linux_uid_mappings/linux_uid_mappings.t" "misc_props/misc_props.t" diff --git a/seccomp/Cargo.toml b/seccomp/Cargo.toml new file mode 100644 index 000000000..097d5e332 --- /dev/null +++ b/seccomp/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "seccomp" +version = "0.1.0" +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + 
+[dependencies] +libc = "0.2.84" \ No newline at end of file diff --git a/seccomp/README.md b/seccomp/README.md new file mode 100644 index 000000000..31f7ae3e0 --- /dev/null +++ b/seccomp/README.md @@ -0,0 +1,11 @@ +# Bindings to libseccomp + +This crate contains a Rust FFI binding to +[libseccomp](https://github.com/seccomp/libseccomp). + +The code is adapted from auto-generated code using +[rust-bindgen](https://github.com/rust-lang/rust-bindgen). `rust-bindgen` +has trouble detecting function-like macros, which `libseccomp` uses. We +decided to manually fix the issue and include the bindings in this crate. + +The header file used: diff --git a/seccomp/src/lib.rs b/seccomp/src/lib.rs new file mode 100644 index 000000000..116e9a622 --- /dev/null +++ b/seccomp/src/lib.rs @@ -0,0 +1,661 @@ +extern crate libc; + +#[allow(non_camel_case_types)] +pub type __s8 = ::std::os::raw::c_schar; +#[allow(non_camel_case_types)] +pub type __u8 = ::std::os::raw::c_uchar; +#[allow(non_camel_case_types)] +pub type __s16 = ::std::os::raw::c_short; +#[allow(non_camel_case_types)] +pub type __u16 = ::std::os::raw::c_ushort; +#[allow(non_camel_case_types)] +pub type __s32 = ::std::os::raw::c_int; +#[allow(non_camel_case_types)] +pub type __u32 = ::std::os::raw::c_uint; +#[allow(non_camel_case_types)] +pub type __s64 = ::std::os::raw::c_longlong; +#[allow(non_camel_case_types)] +pub type __u64 = ::std::os::raw::c_ulonglong; + +pub const SCMP_VER_MAJOR: u32 = 2; +pub const SCMP_VER_MINOR: u32 = 5; +pub const SCMP_VER_MICRO: u32 = 1; + +pub const __NR_SCMP_ERROR: i32 = -1; +pub const __NR_SCMP_UNDEF: i32 = -2; + +#[allow(non_camel_case_types)] +pub type scmp_arch = u32; +pub const SCMP_ARCH_NATIVE: scmp_arch = 0; +pub const SCMP_ARCH_X86: scmp_arch = 1073741827; +pub const SCMP_ARCH_X86_64: scmp_arch = 3221225534; +pub const SCMP_ARCH_X32: scmp_arch = 1073741886; +pub const SCMP_ARCH_ARM: scmp_arch = 1073741864; +pub const SCMP_ARCH_AARCH64: scmp_arch = 3221225655; +pub const 
SCMP_ARCH_MIPS: scmp_arch = 8; +pub const SCMP_ARCH_MIPS64: scmp_arch = 2147483656; +pub const SCMP_ARCH_MIPS64N32: scmp_arch = 2684354568; +pub const SCMP_ARCH_MIPSEL: scmp_arch = 1073741832; +pub const SCMP_ARCH_MIPSEL64: scmp_arch = 3221225480; +pub const SCMP_ARCH_MIPSEL64N32: scmp_arch = 3758096392; +pub const SCMP_ARCH_PPC: scmp_arch = 20; +pub const SCMP_ARCH_PPC64: scmp_arch = 2147483669; +pub const SCMP_ARCH_PPC64LE: scmp_arch = 3221225493; +pub const SCMP_ARCH_S390: scmp_arch = 22; +pub const SCMP_ARCH_S390X: scmp_arch = 2147483670; +pub const SCMP_ARCH_PARISC: scmp_arch = 15; +pub const SCMP_ARCH_PARISC64: scmp_arch = 2147483663; +pub const SCMP_ARCH_RISCV64: scmp_arch = 3221225715; + +pub const SCMP_ACT_KILL_PROCESS: u32 = 2147483648; +pub const SCMP_ACT_KILL_THREAD: u32 = 0; +pub const SCMP_ACT_KILL: u32 = 0; +pub const SCMP_ACT_TRAP: u32 = 196608; +pub const SCMP_ACT_NOTIFY: u32 = 2143289344; +pub const SCMP_ACT_LOG: u32 = 2147221504; +pub const SCMP_ACT_ALLOW: u32 = 2147418112; +#[allow(non_snake_case)] +pub fn SCMP_ACT_ERRNO(x: u32) -> u32 { + 0x00050000 | ((x) & 0x0000ffff) +} +#[allow(non_snake_case)] +pub fn SCMP_ACT_TRACE(x: u32) -> u32 { + 0x7ff00000 | ((x) & 0x0000ffff) +} + +#[allow(non_camel_case_types)] +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub enum scmp_filter_attr { + _SCMP_FLTATR_MIN, + SCMP_FLTATR_ACT_DEFAULT, + SCMP_FLTATR_ACT_BADARCH, + SCMP_FLTATR_CTL_NNP, + SCMP_FLTATR_CTL_TSYNC, + SCMP_FLTATR_API_TSKIP, + SCMP_FLTATR_CTL_LOG, + SCMP_FLTATR_CTL_SSB, + SCMP_FLTATR_CTL_OPTIMIZE, + SCMP_FLTATR_API_SYSRAWRC, + _SCMP_FLTATR_MAX, +} + +#[allow(non_camel_case_types)] +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub enum scmp_compare { + _SCMP_CMP_MIN = 0, + SCMP_CMP_NE = 1, + SCMP_CMP_LT = 2, + SCMP_CMP_LE = 3, + SCMP_CMP_EQ = 4, + SCMP_CMP_GE = 5, + SCMP_CMP_GT = 6, + SCMP_CMP_MASKED_EQ = 7, + _SCMP_CMP_MAX = 8, +} + +#[allow(non_camel_case_types)] +pub type scmp_datum_t = u64; + +#[allow(non_camel_case_types)] +pub type 
scmp_filter_ctx = *mut ::std::os::raw::c_void; + +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct scmp_version { + pub major: ::std::os::raw::c_uint, + pub minor: ::std::os::raw::c_uint, + pub micro: ::std::os::raw::c_uint, +} + +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct scmp_arg_cmp { + pub arg: ::std::os::raw::c_uint, + pub op: scmp_compare, + pub datum_a: scmp_datum_t, + pub datum_b: scmp_datum_t, +} + +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct seccomp_data { + pub nr: ::std::os::raw::c_int, + pub arch: __u32, + pub instruction_pointer: __u64, + pub args: [__u64; 6usize], +} + +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct seccomp_notif_sizes { + pub seccomp_notif: __u16, + pub seccomp_notif_resp: __u16, + pub seccomp_data: __u16, +} + +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct seccomp_notif { + pub id: __u64, + pub pid: __u32, + pub flags: __u32, + pub data: seccomp_data, +} + +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct seccomp_notif_resp { + pub id: __u64, + pub val: __s64, + pub error: __s32, + pub flags: __u32, +} + +#[link(name = "seccomp")] +extern "C" { + /** + * Query the library version information + * + * This function returns a pointer to a populated scmp_version struct, the + * caller does not need to free the structure when finished. + * + */ + pub fn seccomp_version() -> *const scmp_version; + + /** + * Query the library's level of API support + * + * This function returns an API level value indicating the current supported + * functionality. It is important to note that this level of support is + * determined at runtime and therefore can change based on the running kernel + * and system configuration (e.g. any previously loaded seccomp filters). This + * function can be called multiple times, but it only queries the system the + * first time it is called, the API level is cached and used in subsequent + * calls. 
+ * + * The current API levels are described below: + * 0 : reserved + * 1 : base level + * 2 : support for the SCMP_FLTATR_CTL_TSYNC filter attribute + * uses the seccomp(2) syscall instead of the prctl(2) syscall + * 3 : support for the SCMP_FLTATR_CTL_LOG filter attribute + * support for the SCMP_ACT_LOG action + * support for the SCMP_ACT_KILL_PROCESS action + * 4 : support for the SCMP_FLTATR_CTL_SSB filter attribute + * 5 : support for the SCMP_ACT_NOTIFY action and notify APIs + * 6 : support the simultaneous use of SCMP_FLTATR_CTL_TSYNC and notify APIs + * + */ + pub fn seccomp_api_get() -> ::std::os::raw::c_uint; + + /** + * Set the library's level of API support + * + * This function forcibly sets the API level of the library at runtime. Valid + * API levels are discussed in the description of the seccomp_api_get() + * function. General use of this function is strongly discouraged. + * + */ + pub fn seccomp_api_set(level: ::std::os::raw::c_uint) -> ::std::os::raw::c_int; + + /** + * Initialize the filter state + * @param def_action the default filter action + * + * This function initializes the internal seccomp filter state and should + * be called before any other functions in this library to ensure the filter + * state is initialized. Returns a filter context on success, NULL on failure. + * + */ + pub fn seccomp_init(def_action: u32) -> scmp_filter_ctx; + + /** + * Reset the filter state + * @param ctx the filter context + * @param def_action the default filter action + * + * This function resets the given seccomp filter state and ensures the + * filter state is reinitialized. This function does not reset any seccomp + * filters already loaded into the kernel. Returns zero on success, negative + * values on failure. 
+ * + */ + pub fn seccomp_reset(ctx: scmp_filter_ctx, def_action: u32) -> ::std::os::raw::c_int; + + /** + * Destroys the filter state and releases any resources + * @param ctx the filter context + * + * This function destroys the given seccomp filter state and releases any + * resources, including memory, associated with the filter state. This + * function does not reset any seccomp filters already loaded into the kernel. + * The filter context can no longer be used after calling this function. + * + */ + pub fn seccomp_release(ctx: scmp_filter_ctx); + + /** + * Merge two filters + * @param ctx_dst the destination filter context + * @param ctx_src the source filter context + * + * This function merges two filter contexts into a single filter context and + * destroys the second filter context. The two filter contexts must have the + * same attribute values and not contain any of the same architectures; if they + * do, the merge operation will fail. On success, the source filter context + * will be destroyed and should no longer be used; it is not necessary to + * call seccomp_release() on the source filter context. Returns zero on + * success, negative values on failure. + * + */ + pub fn seccomp_merge( + ctx_dst: scmp_filter_ctx, + ctx_src: scmp_filter_ctx, + ) -> ::std::os::raw::c_int; + + /** + * Resolve the architecture name to an architecture token + * @param arch_name the architecture name + * + * This function resolves the given architecture name to a token suitable for + * use with libseccomp, returns zero on failure. + * + */ + pub fn seccomp_arch_resolve_name(arch_name: *const ::std::os::raw::c_char) -> u32; + + /** + * Return the native architecture token + * + * This function returns the native architecture token value, e.g. SCMP_ARCH_*. + * + */ + pub fn seccomp_arch_native() -> u32; + + /** + * Check to see if an existing architecture is present in the filter + * @param ctx the filter context + * @param arch_token the architecture token, e.g. 
SCMP_ARCH_* + * + * This function tests to see if a given architecture is included in the filter + * context. If the architecture token is SCMP_ARCH_NATIVE then the native + * architecture will be assumed. Returns zero if the architecture exists in + * the filter, -EEXIST if it is not present, and other negative values on + * failure. + * + */ + pub fn seccomp_arch_exist(ctx: scmp_filter_ctx, arch_token: u32) -> ::std::os::raw::c_int; + + /** + * Adds an architecture to the filter + * @param ctx the filter context + * @param arch_token the architecture token, e.g. SCMP_ARCH_* + * + * This function adds a new architecture to the given seccomp filter context. + * Any new rules added after this function successfully returns will be added + * to this architecture but existing rules will not be added to this + * architecture. If the architecture token is SCMP_ARCH_NATIVE then the native + * architecture will be assumed. Returns zero on success, -EEXIST if + * specified architecture is already present, other negative values on failure. + * + */ + pub fn seccomp_arch_add(ctx: scmp_filter_ctx, arch_token: u32) -> ::std::os::raw::c_int; + + /** + * Removes an architecture from the filter + * @param ctx the filter context + * @param arch_token the architecture token, e.g. SCMP_ARCH_* + * + * This function removes an architecture from the given seccomp filter context. + * If the architecture token is SCMP_ARCH_NATIVE then the native architecture + * will be assumed. Returns zero on success, negative values on failure. + * + */ + pub fn seccomp_arch_remove(ctx: scmp_filter_ctx, arch_token: u32) -> ::std::os::raw::c_int; + + /** + * Loads the filter into the kernel + * @param ctx the filter context + * + * This function loads the given seccomp filter context into the kernel. If + * the filter was loaded correctly, the kernel will be enforcing the filter + * when this function returns. Returns zero on success, negative values on + * error. 
+ * + */ + pub fn seccomp_load(ctx: scmp_filter_ctx) -> ::std::os::raw::c_int; + + /** + * Get the value of a filter attribute + * @param ctx the filter context + * @param attr the filter attribute name + * @param value the filter attribute value + * + * This function fetches the value of the given attribute name and returns it + * via @value. Returns zero on success, negative values on failure. + * + */ + pub fn seccomp_attr_get( + ctx: scmp_filter_ctx, + attr: scmp_filter_attr, + value: *mut u32, + ) -> ::std::os::raw::c_int; + + /** + * Set the value of a filter attribute + * @param ctx the filter context + * @param attr the filter attribute name + * @param value the filter attribute value + * + * This function sets the value of the given attribute. Returns zero on + * success, negative values on failure. + * + */ + pub fn seccomp_attr_set( + ctx: scmp_filter_ctx, + attr: scmp_filter_attr, + value: u32, + ) -> ::std::os::raw::c_int; + + /** + * Resolve a syscall number to a name + * @param arch_token the architecture token, e.g. SCMP_ARCH_* + * @param num the syscall number + * + * Resolve the given syscall number to the syscall name for the given + * architecture; it is up to the caller to free the returned string. Returns + * the syscall name on success, NULL on failure. + * + */ + pub fn seccomp_syscall_resolve_num_arch( + arch_token: u32, + num: ::std::os::raw::c_int, + ) -> *mut ::std::os::raw::c_char; + + /** + * Resolve a syscall name to a number + * @param arch_token the architecture token, e.g. SCMP_ARCH_* + * @param name the syscall name + * + * Resolve the given syscall name to the syscall number for the given + * architecture. Returns the syscall number on success, including negative + * pseudo syscall numbers (e.g. __PNR_*); returns __NR_SCMP_ERROR on failure. 
+ * + */ + pub fn seccomp_syscall_resolve_name_arch( + arch_token: u32, + name: *const ::std::os::raw::c_char, + ) -> ::std::os::raw::c_int; + + /** + * Resolve a syscall name to a number and perform any rewriting necessary + * @param arch_token the architecture token, e.g. SCMP_ARCH_* + * @param name the syscall name + * + * Resolve the given syscall name to the syscall number for the given + * architecture and do any necessary syscall rewriting needed by the + * architecture. Returns the syscall number on success, including negative + * pseudo syscall numbers (e.g. __PNR_*); returns __NR_SCMP_ERROR on failure. + * + */ + pub fn seccomp_syscall_resolve_name_rewrite( + arch_token: u32, + name: *const ::std::os::raw::c_char, + ) -> ::std::os::raw::c_int; + + /** + * Resolve a syscall name to a number + * @param name the syscall name + * + * Resolve the given syscall name to the syscall number. Returns the syscall + * number on success, including negative pseudo syscall numbers (e.g. __PNR_*); + * returns __NR_SCMP_ERROR on failure. + * + */ + pub fn seccomp_syscall_resolve_name( + name: *const ::std::os::raw::c_char, + ) -> ::std::os::raw::c_int; + + /** + * Set the priority of a given syscall + * @param ctx the filter context + * @param syscall the syscall number + * @param priority priority value, higher value == higher priority + * + * This function sets the priority of the given syscall; this value is used + * when generating the seccomp filter code such that higher priority syscalls + * will incur less filter code overhead than the lower priority syscalls in the + * filter. Returns zero on success, negative values on failure. 
+ * + */ + pub fn seccomp_syscall_priority( + ctx: scmp_filter_ctx, + syscall: ::std::os::raw::c_int, + priority: u8, + ) -> ::std::os::raw::c_int; + + /** + * Add a new rule to the filter + * @param ctx the filter context + * @param action the filter action + * @param syscall the syscall number + * @param arg_cnt the number of argument filters in the argument filter chain + * @param ... scmp_arg_cmp structs (use of SCMP_ARG_CMP() recommended) + * + * This function adds a series of new argument/value checks to the seccomp + * filter for the given syscall; multiple argument/value checks can be + * specified and they will be chained together (AND'd together) in the filter. + * If the specified rule needs to be adjusted due to architecture specifics it + * will be adjusted without notification. Returns zero on success, negative + * values on failure. + * + */ + pub fn seccomp_rule_add( + ctx: scmp_filter_ctx, + action: u32, + syscall: ::std::os::raw::c_int, + arg_cnt: ::std::os::raw::c_uint, + ... + ) -> ::std::os::raw::c_int; + + /** + * Add a new rule to the filter + * @param ctx the filter context + * @param action the filter action + * @param syscall the syscall number + * @param arg_cnt the number of elements in the arg_array parameter + * @param arg_array array of scmp_arg_cmp structs + * + * This function adds a series of new argument/value checks to the seccomp + * filter for the given syscall; multiple argument/value checks can be + * specified and they will be chained together (AND'd together) in the filter. + * If the specified rule needs to be adjusted due to architecture specifics it + * will be adjusted without notification. Returns zero on success, negative + * values on failure. 
+ * + */ + pub fn seccomp_rule_add_array( + ctx: scmp_filter_ctx, + action: u32, + syscall: ::std::os::raw::c_int, + arg_cnt: ::std::os::raw::c_uint, + arg_array: *const scmp_arg_cmp, + ) -> ::std::os::raw::c_int; + + /** + * Add a new rule to the filter + * @param ctx the filter context + * @param action the filter action + * @param syscall the syscall number + * @param arg_cnt the number of argument filters in the argument filter chain + * @param ... scmp_arg_cmp structs (use of SCMP_ARG_CMP() recommended) + * + * This function adds a series of new argument/value checks to the seccomp + * filter for the given syscall; multiple argument/value checks can be + * specified and they will be chained together (AND'd together) in the filter. + * If the specified rule can not be represented on the architecture the + * function will fail. Returns zero on success, negative values on failure. + * + */ + pub fn seccomp_rule_add_exact( + ctx: scmp_filter_ctx, + action: u32, + syscall: ::std::os::raw::c_int, + arg_cnt: ::std::os::raw::c_uint, + ... + ) -> ::std::os::raw::c_int; + + /** + * Add a new rule to the filter + * @param ctx the filter context + * @param action the filter action + * @param syscall the syscall number + * @param arg_cnt the number of elements in the arg_array parameter + * @param arg_array array of scmp_arg_cmp structs + * + * This function adds a series of new argument/value checks to the seccomp + * filter for the given syscall; multiple argument/value checks can be + * specified and they will be chained together (AND'd together) in the filter. + * If the specified rule can not be represented on the architecture the + * function will fail. Returns zero on success, negative values on failure. 
+ * + */ + pub fn seccomp_rule_add_exact_array( + ctx: scmp_filter_ctx, + action: u32, + syscall: ::std::os::raw::c_int, + arg_cnt: ::std::os::raw::c_uint, + arg_array: *const scmp_arg_cmp, + ) -> ::std::os::raw::c_int; + + /** + * Allocate a pair of notification request/response structures + * @param req the request location + * @param resp the response location + * + * This function allocates a pair of request/response structures by computing + * the correct size based on the currently running kernel. It returns zero on + * success, and negative values on failure. + * + */ + pub fn seccomp_notify_alloc( + req: *mut *mut seccomp_notif, + resp: *mut *mut seccomp_notif_resp, + ) -> ::std::os::raw::c_int; + + /** + * Free a pair of notification request/response structures. + * @param req the request location + * @param resp the response location + */ + pub fn seccomp_notify_free(req: *mut seccomp_notif, resp: *mut seccomp_notif_resp); + + /** + * Receive a notification from a seccomp notification fd + * @param fd the notification fd + * @param req the request buffer to save into + * + * Blocks waiting for a notification on this fd. This function is thread safe + * (synchronization is performed in the kernel). Returns zero on success, + * negative values on error. 
+ * + */ + pub fn seccomp_notify_respond( + fd: ::std::os::raw::c_int, + resp: *mut seccomp_notif_resp, + ) -> ::std::os::raw::c_int; + + /** + * Check if a notification id is still valid + * @param fd the notification fd + * @param id the id to test + * + * Checks to see if a notification id is still valid. Returns 0 on success, and + * negative values on failure. + * + */ + pub fn seccomp_notify_id_valid(fd: ::std::os::raw::c_int, id: u64) -> ::std::os::raw::c_int; + + /** + * Return the notification fd from a filter that has already been loaded + * @param ctx the filter context + * + * This returns the listener fd that was generated when the seccomp policy was + * loaded. This is only valid after seccomp_load() with a filter that makes + * use of SCMP_ACT_NOTIFY. + * + */ + pub fn seccomp_notify_fd(ctx: scmp_filter_ctx) -> ::std::os::raw::c_int; + + /** + * Generate seccomp Pseudo Filter Code (PFC) and export it to a file + * @param ctx the filter context + * @param fd the destination fd + * + * This function generates seccomp Pseudo Filter Code (PFC) and writes it to + * the given fd. Returns zero on success, negative values on failure. + * + */ + pub fn seccomp_export_pfc( + ctx: scmp_filter_ctx, + fd: ::std::os::raw::c_int, + ) -> ::std::os::raw::c_int; + + /** + * Generate seccomp Berkeley Packet Filter (BPF) code and export it to a file + * @param ctx the filter context + * @param fd the destination fd + * + * This function generates seccomp Berkeley Packet Filter (BPF) code and writes + * it to the given fd. Returns zero on success, negative values on failure. + * + */ + pub fn seccomp_export_bpf( + ctx: scmp_filter_ctx, + fd: ::std::os::raw::c_int, + ) -> ::std::os::raw::c_int; + +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn it_works() { + // Note: we should probably run this in a different process, since it + // loads a seccomp profile. However, since this is the only test in the + // repo at the moment, this should be OK for now. 
+ unsafe { + let ctx = seccomp_init(SCMP_ACT_ALLOW); + let cmp = scmp_arg_cmp { + arg: 0, + op: scmp_compare::SCMP_CMP_EQ, + datum_a: 1000, + datum_b: 0, + }; + + let c_syscall_name = std::ffi::CString::new("getcwd").unwrap(); + let syscall_number = seccomp_syscall_resolve_name(c_syscall_name.as_ptr()); + + assert!(seccomp_rule_add(ctx, SCMP_ACT_ERRNO(42), syscall_number, 1, cmp) == 0); + assert!(seccomp_load(ctx) == 0); + } + } +} diff --git a/src/lib.rs b/src/lib.rs index 6a20a35fd..da320ee1e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,6 +9,7 @@ pub mod notify_socket; pub mod process; pub mod rootfs; pub mod rootless; +pub mod seccomp; pub mod signal; pub mod syscall; pub mod tty; diff --git a/src/process/init.rs b/src/process/init.rs index 03eb97e8d..bacd27990 100644 --- a/src/process/init.rs +++ b/src/process/init.rs @@ -1,3 +1,8 @@ +use super::args::ContainerArgs; +use crate::{ + capabilities, hooks, namespaces::Namespaces, process::channel, rootfs, rootless::Rootless, + seccomp, syscall::Syscall, tty, utils, +}; use anyhow::{bail, Context, Result}; use nix::mount::mount as nix_mount; use nix::mount::MsFlags; @@ -9,17 +14,12 @@ use nix::{ }; use oci_spec::runtime::{LinuxNamespaceType, User}; use std::collections::HashMap; -use std::{env, os::unix::io::AsRawFd}; -use std::{fs, path::Path, path::PathBuf}; - -use crate::rootless::Rootless; -use crate::{ - capabilities, hooks, namespaces::Namespaces, process::channel, rootfs, syscall::Syscall, tty, - utils, +use std::{ + env, fs, + os::unix::io::AsRawFd, + path::{Path, PathBuf}, }; -use super::args::ContainerArgs; - // Make sure a given path is on procfs. This is to avoid the security risk that // /proc path is mounted over. Ref: CVE-2019-16884 fn ensure_procfs(path: &Path) -> Result<()> { @@ -299,6 +299,14 @@ pub fn container_init( .set_id(Uid::from_raw(proc.user.uid), Gid::from_raw(proc.user.gid)) .context("Failed to configure uid and gid")?; + // Without no new privileges, seccomp is a privileged operation. 
We have to + // do this before dropping capabilities. Otherwise, we should do it later, + // as close to exec as possible. + if linux.seccomp.is_some() && proc.no_new_privileges.is_none() { + seccomp::initialize_seccomp(linux.seccomp.as_ref().unwrap()) + .context("Failed to execute seccomp")?; + } + capabilities::reset_effective(command).context("Failed to reset effective capabilities")?; if let Some(caps) = &proc.capabilities { capabilities::drop_privileges(caps, command).context("Failed to drop capabilities")?; @@ -377,6 +385,13 @@ pub fn container_init( } } + if linux.seccomp.is_some() && proc.no_new_privileges.is_some() { + // Initialize seccomp profile right before we are ready to execute the + // payload. The notify socket will still need network related syscalls. + seccomp::initialize_seccomp(linux.seccomp.as_ref().unwrap()) + .context("Failed to execute seccomp")?; + } + if let Some(args) = proc.args.as_ref() { utils::do_exec(&args[0], args)?; } else { diff --git a/src/seccomp/fixture/config.json b/src/seccomp/fixture/config.json new file mode 100644 index 000000000..04c6764a2 --- /dev/null +++ b/src/seccomp/fixture/config.json @@ -0,0 +1,964 @@ +{ + "ociVersion": "1.0.1-dev", + "process": { + "terminal": false, + "user": { + "uid": 0, + "gid": 0 + }, + "args": [ + "helloworld" + ], + "env": [ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "TERM=xterm" + ], + "cwd": "/", + "capabilities": { + "bounding": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ], + "effective": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ], + "inheritable": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ], + "permitted": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ], + "ambient": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ] + }, + "rlimits": [ + { + "type": "RLIMIT_NOFILE", + "hard": 1024, + "soft": 1024 + } + ], + "noNewPrivileges": true + }, + "root": { + "path": 
"tests/assets/oci/helloworld/rootfs" + }, + "hostname": "runc", + "mounts": [ + { + "destination": "/proc", + "type": "proc", + "source": "proc" + }, + { + "destination": "/dev", + "type": "tmpfs", + "source": "tmpfs", + "options": [ + "nosuid", + "strictatime", + "mode=755", + "size=65536k" + ] + }, + { + "destination": "/dev/pts", + "type": "devpts", + "source": "devpts", + "options": [ + "nosuid", + "noexec", + "newinstance", + "ptmxmode=0666", + "mode=0620", + "gid=5" + ] + }, + { + "destination": "/dev/shm", + "type": "tmpfs", + "source": "shm", + "options": [ + "nosuid", + "noexec", + "nodev", + "mode=1777", + "size=65536k" + ] + }, + { + "destination": "/dev/mqueue", + "type": "mqueue", + "source": "mqueue", + "options": [ + "nosuid", + "noexec", + "nodev" + ] + }, + { + "destination": "/sys", + "type": "sysfs", + "source": "sysfs", + "options": [ + "nosuid", + "noexec", + "nodev", + "ro" + ] + }, + { + "destination": "/sys/fs/cgroup", + "type": "cgroup", + "source": "cgroup", + "options": [ + "nosuid", + "noexec", + "nodev", + "relatime", + "ro" + ] + } + ], + "linux": { + "devices": [ + { + "path": "/dev/kvm", + "type": "c", + "major": 10, + "minor": 232, + "fileMode": 666, + "uid": 0, + "gid": 36 + } + ], + "seccomp": { + "defaultAction": "SCMP_ACT_ERRNO", + "defaultErrnoRet": 1, + "archMap": [ + { + "architecture": "SCMP_ARCH_X86_64", + "subArchitectures": [ + "SCMP_ARCH_X86", + "SCMP_ARCH_X32" + ] + }, + { + "architecture": "SCMP_ARCH_AARCH64", + "subArchitectures": [ + "SCMP_ARCH_ARM" + ] + }, + { + "architecture": "SCMP_ARCH_MIPS64", + "subArchitectures": [ + "SCMP_ARCH_MIPS", + "SCMP_ARCH_MIPS64N32" + ] + }, + { + "architecture": "SCMP_ARCH_MIPS64N32", + "subArchitectures": [ + "SCMP_ARCH_MIPS", + "SCMP_ARCH_MIPS64" + ] + }, + { + "architecture": "SCMP_ARCH_MIPSEL64", + "subArchitectures": [ + "SCMP_ARCH_MIPSEL", + "SCMP_ARCH_MIPSEL64N32" + ] + }, + { + "architecture": "SCMP_ARCH_MIPSEL64N32", + "subArchitectures": [ + "SCMP_ARCH_MIPSEL", + 
"SCMP_ARCH_MIPSEL64" + ] + }, + { + "architecture": "SCMP_ARCH_S390X", + "subArchitectures": [ + "SCMP_ARCH_S390" + ] + } + ], + "syscalls": [ + { + "names": [ + "accept", + "accept4", + "access", + "adjtimex", + "alarm", + "bind", + "brk", + "capget", + "capset", + "chdir", + "chmod", + "chown", + "chown32", + "clock_adjtime", + "clock_adjtime64", + "clock_getres", + "clock_getres_time64", + "clock_gettime", + "clock_gettime64", + "clock_nanosleep", + "clock_nanosleep_time64", + "close", + "close_range", + "connect", + "copy_file_range", + "creat", + "dup", + "dup2", + "dup3", + "epoll_create", + "epoll_create1", + "epoll_ctl", + "epoll_ctl_old", + "epoll_pwait", + "epoll_pwait2", + "epoll_wait", + "epoll_wait_old", + "eventfd", + "eventfd2", + "execve", + "execveat", + "exit", + "exit_group", + "faccessat", + "faccessat2", + "fadvise64", + "fadvise64_64", + "fallocate", + "fanotify_mark", + "fchdir", + "fchmod", + "fchmodat", + "fchown", + "fchown32", + "fchownat", + "fcntl", + "fcntl64", + "fdatasync", + "fgetxattr", + "flistxattr", + "flock", + "fork", + "fremovexattr", + "fsetxattr", + "fstat", + "fstat64", + "fstatat64", + "fstatfs", + "fstatfs64", + "fsync", + "ftruncate", + "ftruncate64", + "futex", + "futex_time64", + "futimesat", + "getcpu", + "getcwd", + "getdents", + "getdents64", + "getegid", + "getegid32", + "geteuid", + "geteuid32", + "getgid", + "getgid32", + "getgroups", + "getgroups32", + "getitimer", + "getpeername", + "getpgid", + "getpgrp", + "getpid", + "getppid", + "getpriority", + "getrandom", + "getresgid", + "getresgid32", + "getresuid", + "getresuid32", + "getrlimit", + "get_robust_list", + "getrusage", + "getsid", + "getsockname", + "getsockopt", + "get_thread_area", + "gettid", + "gettimeofday", + "getuid", + "getuid32", + "getxattr", + "inotify_add_watch", + "inotify_init", + "inotify_init1", + "inotify_rm_watch", + "io_cancel", + "ioctl", + "io_destroy", + "io_getevents", + "io_pgetevents", + "io_pgetevents_time64", + "ioprio_get", + 
"ioprio_set", + "io_setup", + "io_submit", + "io_uring_enter", + "io_uring_register", + "io_uring_setup", + "ipc", + "kill", + "lchown", + "lchown32", + "lgetxattr", + "link", + "linkat", + "listen", + "listxattr", + "llistxattr", + "_llseek", + "lremovexattr", + "lseek", + "lsetxattr", + "lstat", + "lstat64", + "madvise", + "membarrier", + "memfd_create", + "mincore", + "mkdir", + "mkdirat", + "mknod", + "mknodat", + "mlock", + "mlock2", + "mlockall", + "mmap", + "mmap2", + "mprotect", + "mq_getsetattr", + "mq_notify", + "mq_open", + "mq_timedreceive", + "mq_timedreceive_time64", + "mq_timedsend", + "mq_timedsend_time64", + "mq_unlink", + "mremap", + "msgctl", + "msgget", + "msgrcv", + "msgsnd", + "msync", + "munlock", + "munlockall", + "munmap", + "nanosleep", + "newfstatat", + "_newselect", + "open", + "openat", + "openat2", + "pause", + "pidfd_open", + "pidfd_send_signal", + "pipe", + "pipe2", + "poll", + "ppoll", + "ppoll_time64", + "prctl", + "pread64", + "preadv", + "preadv2", + "prlimit64", + "pselect6", + "pselect6_time64", + "pwrite64", + "pwritev", + "pwritev2", + "read", + "readahead", + "readlink", + "readlinkat", + "readv", + "recv", + "recvfrom", + "recvmmsg", + "recvmmsg_time64", + "recvmsg", + "remap_file_pages", + "removexattr", + "rename", + "renameat", + "renameat2", + "restart_syscall", + "rmdir", + "rseq", + "rt_sigaction", + "rt_sigpending", + "rt_sigprocmask", + "rt_sigqueueinfo", + "rt_sigreturn", + "rt_sigsuspend", + "rt_sigtimedwait", + "rt_sigtimedwait_time64", + "rt_tgsigqueueinfo", + "sched_getaffinity", + "sched_getattr", + "sched_getparam", + "sched_get_priority_max", + "sched_get_priority_min", + "sched_getscheduler", + "sched_rr_get_interval", + "sched_rr_get_interval_time64", + "sched_setaffinity", + "sched_setattr", + "sched_setparam", + "sched_setscheduler", + "sched_yield", + "seccomp", + "select", + "semctl", + "semget", + "semop", + "semtimedop", + "semtimedop_time64", + "send", + "sendfile", + "sendfile64", + "sendmmsg", + 
"sendmsg", + "sendto", + "setfsgid", + "setfsgid32", + "setfsuid", + "setfsuid32", + "setgid", + "setgid32", + "setgroups", + "setgroups32", + "setitimer", + "setpgid", + "setpriority", + "setregid", + "setregid32", + "setresgid", + "setresgid32", + "setresuid", + "setresuid32", + "setreuid", + "setreuid32", + "setrlimit", + "set_robust_list", + "setsid", + "setsockopt", + "set_thread_area", + "set_tid_address", + "setuid", + "setuid32", + "setxattr", + "shmat", + "shmctl", + "shmdt", + "shmget", + "shutdown", + "sigaltstack", + "signalfd", + "signalfd4", + "sigprocmask", + "sigreturn", + "socket", + "socketcall", + "socketpair", + "splice", + "stat", + "stat64", + "statfs", + "statfs64", + "statx", + "symlink", + "symlinkat", + "sync", + "sync_file_range", + "syncfs", + "sysinfo", + "tee", + "tgkill", + "time", + "timer_create", + "timer_delete", + "timer_getoverrun", + "timer_gettime", + "timer_gettime64", + "timer_settime", + "timer_settime64", + "timerfd_create", + "timerfd_gettime", + "timerfd_gettime64", + "timerfd_settime", + "timerfd_settime64", + "times", + "tkill", + "truncate", + "truncate64", + "ugetrlimit", + "umask", + "uname", + "unlink", + "unlinkat", + "utime", + "utimensat", + "utimensat_time64", + "utimes", + "vfork", + "vmsplice", + "wait4", + "waitid", + "waitpid", + "write", + "writev" + ], + "action": "SCMP_ACT_ALLOW" + }, + { + "names": [ + "process_vm_readv", + "process_vm_writev", + "ptrace" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "minKernel": "4.8" + } + }, + { + "names": [ + "personality" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 0, + "value": 0, + "op": "SCMP_CMP_EQ" + }, + { + "index": 0, + "value": 8, + "op": "SCMP_CMP_EQ" + } + ] + }, + { + "names": [ + "personality" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 0, + "value": 131072, + "op": "SCMP_CMP_EQ" + } + ] + }, + { + "names": [ + "personality" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 0, + "value": 131080, + 
"op": "SCMP_CMP_EQ" + } + ] + }, + { + "names": [ + "personality" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 0, + "value": 4294967295, + "op": "SCMP_CMP_EQ" + } + ] + }, + { + "names": [ + "sync_file_range2" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "arches": [ + "ppc64le" + ] + } + }, + { + "names": [ + "arm_fadvise64_64", + "arm_sync_file_range", + "sync_file_range2", + "breakpoint", + "cacheflush", + "set_tls" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "arches": [ + "arm", + "arm64" + ] + } + }, + { + "names": [ + "arch_prctl" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "arches": [ + "amd64", + "x32" + ] + } + }, + { + "names": [ + "modify_ldt" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "arches": [ + "amd64", + "x32", + "x86" + ] + } + }, + { + "names": [ + "s390_pci_mmio_read", + "s390_pci_mmio_write", + "s390_runtime_instr" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "arches": [ + "s390", + "s390x" + ] + } + }, + { + "names": [ + "open_by_handle_at" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_DAC_READ_SEARCH" + ] + } + }, + { + "names": [ + "bpf", + "clone", + "clone3", + "fanotify_init", + "fsconfig", + "fsmount", + "fsopen", + "fspick", + "lookup_dcookie", + "mount", + "move_mount", + "name_to_handle_at", + "open_tree", + "perf_event_open", + "quotactl", + "setdomainname", + "sethostname", + "setns", + "syslog", + "umount", + "umount2", + "unshare" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_SYS_ADMIN" + ] + } + }, + { + "names": [ + "clone" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 0, + "value": 2114060288, + "op": "SCMP_CMP_MASKED_EQ" + } + ], + "excludes": { + "caps": [ + "CAP_SYS_ADMIN" + ], + "arches": [ + "s390", + "s390x" + ] + } + }, + { + "names": [ + "clone" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 1, + "value": 2114060288, + "op": "SCMP_CMP_MASKED_EQ" + } + ], + "comment": "s390 parameter 
ordering for clone is different", + "includes": { + "arches": [ + "s390", + "s390x" + ] + }, + "excludes": { + "caps": [ + "CAP_SYS_ADMIN" + ] + } + }, + { + "names": [ + "clone3" + ], + "action": "SCMP_ACT_ERRNO", + "errnoRet": 38, + "excludes": { + "caps": [ + "CAP_SYS_ADMIN" + ] + } + }, + { + "names": [ + "reboot" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_SYS_BOOT" + ] + } + }, + { + "names": [ + "chroot" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_SYS_CHROOT" + ] + } + }, + { + "names": [ + "delete_module", + "init_module", + "finit_module" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_SYS_MODULE" + ] + } + }, + { + "names": [ + "acct" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_SYS_PACCT" + ] + } + }, + { + "names": [ + "kcmp", + "pidfd_getfd", + "process_madvise", + "process_vm_readv", + "process_vm_writev", + "ptrace" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_SYS_PTRACE" + ] + } + }, + { + "names": [ + "iopl", + "ioperm" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_SYS_RAWIO" + ] + } + }, + { + "names": [ + "settimeofday", + "stime", + "clock_settime" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_SYS_TIME" + ] + } + }, + { + "names": [ + "vhangup" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_SYS_TTY_CONFIG" + ] + } + }, + { + "names": [ + "get_mempolicy", + "mbind", + "set_mempolicy" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_SYS_NICE" + ] + } + }, + { + "names": [ + "syslog" + ], + "action": "SCMP_ACT_ALLOW", + "includes": { + "caps": [ + "CAP_SYSLOG" + ] + } + } + ] + }, + "resources": { + "devices": [ + { + "allow": true, + "access": "rwm" + } + ] + }, + "uidMappings": [ + { + "containerID": 0, + "hostID": 1000, + "size": 1 + } + ], + "gidMappings": [ + { + "containerID": 0, + "hostID": 1000, + "size": 1 + } + ], + 
"namespaces": [ + { + "type": "pid" + }, + { + "type": "network" + }, + { + "type": "user" + }, + { + "type": "ipc" + }, + { + "type": "uts" + }, + { + "type": "mount" + } + ], + "maskedPaths": [ + "/proc/kcore", + "/proc/latency_stats", + "/proc/timer_list", + "/proc/timer_stats", + "/proc/sched_debug", + "/sys/firmware", + "/proc/scsi" + ], + "readonlyPaths": [ + "/proc/asound", + "/proc/bus", + "/proc/fs", + "/proc/irq", + "/proc/sys", + "/proc/sysrq-trigger" + ] + } +} \ No newline at end of file diff --git a/src/seccomp/mod.rs b/src/seccomp/mod.rs new file mode 100644 index 000000000..c7690534c --- /dev/null +++ b/src/seccomp/mod.rs @@ -0,0 +1,420 @@ +use anyhow::bail; +use anyhow::Context; +use anyhow::Result; +use oci_spec::runtime::Arch; +use oci_spec::runtime::LinuxSeccomp; +use oci_spec::runtime::LinuxSeccompAction; +use oci_spec::runtime::LinuxSeccompOperator; +use seccomp::scmp_compare::*; +use seccomp::*; +use std::ffi::CString; + +#[derive(Debug)] +struct Compare { + // The zero-indexed index of the syscall arguement. + arg: libc::c_uint, + op: Option, + datum_a: Option, + datum_b: Option, +} + +impl Compare { + pub fn new(args: u32) -> Self { + Compare { + arg: args as libc::c_uint, + op: None, + datum_a: None, + datum_b: None, + } + } + + pub fn op(mut self, op: scmp_compare) -> Self { + self.op = Some(op); + + self + } + + pub fn datum_a(mut self, datum: scmp_datum_t) -> Self { + self.datum_a = Some(datum); + + self + } + + pub fn datum_b(mut self, datum: scmp_datum_t) -> Self { + self.datum_b = Some(datum); + + self + } + + pub fn build(self) -> Result { + if let (Some(op), Some(datum_a)) = (self.op, self.datum_a) { + Ok(scmp_arg_cmp { + arg: self.arg, + op, + datum_a, + // datum_b is optional for a number of op, since these op only + // requires one value. For example, the SCMP_OP_EQ or equal op + // requires only one value. We set the datum_b to 0 in the case + // that only one value is required. 
+ datum_b: self.datum_b.unwrap_or(0), + }) + } else { + bail!("op and datum_a is required: {:?}", self); + } + } +} + +#[derive(Debug)] +struct Rule { + action: u32, + syscall_nr: i32, + comparators: Vec, +} + +impl Rule { + pub fn new(action: u32, syscall_number: i32) -> Self { + Rule { + action, + syscall_nr: syscall_number, + comparators: vec![], + } + } + + pub fn add_comparator(&mut self, cmp: scmp_arg_cmp) { + self.comparators.push(cmp); + } +} + +#[derive(Debug)] +struct FilterContext { + ctx: scmp_filter_ctx, +} + +impl FilterContext { + pub fn default(default_action: u32) -> Result { + let filter_ctx = unsafe { seccomp_init(default_action) }; + if filter_ctx.is_null() { + bail!("Failed to initialized seccomp profile") + } + + Ok(FilterContext { ctx: filter_ctx }) + } + + pub fn add_rule(&mut self, rule: &Rule) -> Result<()> { + let res = match rule.comparators.len() { + 0 => unsafe { seccomp_rule_add(self.ctx, rule.action, rule.syscall_nr, 0) }, + _ => unsafe { + seccomp_rule_add_array( + self.ctx, + rule.action, + rule.syscall_nr, + rule.comparators.len() as u32, + rule.comparators.as_slice().as_ptr(), + ) + }, + }; + if res != 0 { + bail!("Failed to add rule. Errno: {}, Rule: {:?}", res, rule); + } + + Ok(()) + } + + pub fn add_arch(&mut self, arch: u32) -> Result<()> { + let res = unsafe { seccomp_arch_add(self.ctx, arch) }; + if res != 0 && nix::Error::from_i32(res.abs()) != nix::Error::EEXIST { + // The architecture already existed in the profile, so we can + // safely ignore the error here. Otherwise, error out. + bail!("Failed to add architecture {}. 
Errno: {}", arch, res); + } + + Ok(()) + } + + pub fn load(&self) -> Result<()> { + let res = unsafe { seccomp_load(self.ctx) }; + if res != 0 { + bail!("Failed to load seccomp profile: {}", res); + } + + Ok(()) + } +} + +fn translate_syscall(syscall_name: &str) -> Result { + let c_syscall_name = CString::new(syscall_name) + .with_context(|| format!("Failed to convert syscall {:?} to cstring", syscall_name))?; + let res = unsafe { seccomp_syscall_resolve_name(c_syscall_name.as_ptr()) }; + if res == __NR_SCMP_ERROR { + bail!("Failed to resolve syscall from name: {:?}", syscall_name); + } + + Ok(res) +} + +fn translate_action(action: &LinuxSeccompAction, errno: Option) -> u32 { + let errno = errno.unwrap_or(libc::EPERM as u32); + match action { + LinuxSeccompAction::ScmpActKill => SCMP_ACT_KILL, + LinuxSeccompAction::ScmpActTrap => SCMP_ACT_TRAP, + LinuxSeccompAction::ScmpActErrno => SCMP_ACT_ERRNO(errno), + LinuxSeccompAction::ScmpActTrace => SCMP_ACT_TRACE(errno), + LinuxSeccompAction::ScmpActAllow => SCMP_ACT_ALLOW, + LinuxSeccompAction::ScmpActKillProcess => SCMP_ACT_KILL_PROCESS, + LinuxSeccompAction::ScmpActNotify => SCMP_ACT_NOTIFY, + LinuxSeccompAction::ScmpActLog => SCMP_ACT_LOG, + } +} + +fn translate_op(op: &LinuxSeccompOperator) -> scmp_compare { + match op { + LinuxSeccompOperator::ScmpCmpNe => SCMP_CMP_NE, + LinuxSeccompOperator::ScmpCmpLt => SCMP_CMP_LT, + LinuxSeccompOperator::ScmpCmpLe => SCMP_CMP_LE, + LinuxSeccompOperator::ScmpCmpEq => SCMP_CMP_EQ, + LinuxSeccompOperator::ScmpCmpGe => SCMP_CMP_GE, + LinuxSeccompOperator::ScmpCmpGt => SCMP_CMP_GT, + LinuxSeccompOperator::ScmpCmpMaskedEq => SCMP_CMP_MASKED_EQ, + } +} + +fn translate_arch(arch: &Arch) -> scmp_arch { + match arch { + Arch::ScmpArchNative => SCMP_ARCH_NATIVE, + Arch::ScmpArchX86 => SCMP_ARCH_X86, + Arch::ScmpArchX86_64 => SCMP_ARCH_X86_64, + Arch::ScmpArchX32 => SCMP_ARCH_X32, + Arch::ScmpArchArm => SCMP_ARCH_ARM, + Arch::ScmpArchAarch64 => SCMP_ARCH_AARCH64, + Arch::ScmpArchMips => 
SCMP_ARCH_MIPS, + Arch::ScmpArchMips64 => SCMP_ARCH_MIPS64, + Arch::ScmpArchMips64n32 => SCMP_ARCH_MIPS64N32, + Arch::ScmpArchMipsel => SCMP_ARCH_MIPSEL, + Arch::ScmpArchMipsel64 => SCMP_ARCH_MIPSEL64, + Arch::ScmpArchMipsel64n32 => SCMP_ARCH_MIPSEL64N32, + Arch::ScmpArchPpc => SCMP_ARCH_PPC, + Arch::ScmpArchPpc64 => SCMP_ARCH_PPC64, + Arch::ScmpArchPpc64le => SCMP_ARCH_PPC64LE, + Arch::ScmpArchS390 => SCMP_ARCH_S390, + Arch::ScmpArchS390x => SCMP_ARCH_S390X, + } +} + +pub fn initialize_seccomp(seccomp: &LinuxSeccomp) -> Result<()> { + if seccomp.flags.is_some() { + // runc did not support this, so let's skip it for now. + bail!("seccomp flags are not yet supported"); + } + + // TODO: fix default action error number. The spec repo doesn't have it yet. + let default_action = translate_action(&seccomp.default_action, None); + let mut ctx = FilterContext::default(default_action)?; + + if let Some(architectures) = seccomp.architectures.as_ref() { + for arch in architectures { + let arch_token = translate_arch(arch); + ctx.add_arch(arch_token as u32) + .context("Failed to add arch to seccomp")?; + } + } + + // The SCMP_FLTATR_CTL_NNP controls if the seccomp load function will set + // the new privilege bit automatically in prctl. Normally this is a good + // thing, but for us we need better control. Based on the spec, if OCI + // runtime spec doesn't set the no new privileges in Process, we should not + // set it here. If the seccomp load operation fails without enough + // privilege, so be it. To prevent this automatic behavior, we unset the + // value here. 
+ let ret = unsafe { seccomp_attr_set(ctx.ctx, scmp_filter_attr::SCMP_FLTATR_CTL_NNP, 0) }; + if ret != 0 { + bail!( + "Failed to unset the no new privileges bit for seccomp: {}", + ret + ); + } + + if let Some(syscalls) = seccomp.syscalls.as_ref() { + for syscall in syscalls { + let action = translate_action(&syscall.action, syscall.errno_ret); + if action == default_action { + // When the action is the same as the default action, the rule is redundent. We can + // skip this here to avoid failing when we add the rules. + log::warn!( + "Detect a seccomp action that is the same as the default action: {:?}", + syscall + ); + continue; + } + + for name in &syscall.names { + let syscall_number = match translate_syscall(name) { + Ok(x) => x, + Err(_) => { + // If we failed to resolve the syscall by name, likely the kernel + // doeesn't support this syscall. So it is safe to skip... + log::warn!( + "Failed to resolve syscall, likely kernel doesn't support this. {:?}", + name + ); + continue; + } + }; + // Not clear why but if there are multiple arg attached to one + // syscall rule, we have to add them seperatly. add_rule will + // return EINVAL. runc does the same but doesn't explain why. + match syscall.args.as_ref() { + Some(args) => { + for arg in args { + let mut rule = Rule::new(action, syscall_number); + let cmp = Compare::new(arg.index as u32) + .op(translate_op(&arg.op)) + .datum_a(arg.value) + .datum_b(arg.value_two.unwrap_or(0)) + .build() + .context("Failed to build a seccomp compare rule")?; + rule.add_comparator(cmp); + ctx.add_rule(&rule).with_context(|| { + format!( + "Failed to add seccomp rule: {:?}. Syscall: {:?}", + &rule, name, + ) + })?; + } + } + None => { + let rule = Rule::new(action, syscall_number); + ctx.add_rule(&rule).with_context(|| { + format!( + "Failed to add seccomp rule: {:?}. 
Syscall: {:?}", + &rule, name, + ) + })?; + } + } + } + } + } + + // In order to use the SECCOMP_SET_MODE_FILTER operation, either the calling + // thread must have the CAP_SYS_ADMIN capability in its user namespace, or + // the thread must already have the no_new_privs bit set. + // Ref: https://man7.org/linux/man-pages/man2/seccomp.2.html + ctx.load().context("Failed to load seccomp context")?; + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use anyhow::Result; + use mio::unix::pipe; + use nix::sys::wait; + use oci_spec::runtime::{Arch, LinuxSeccomp, LinuxSyscall}; + use serial_test::serial; + use std::io::Read; + use std::io::Write; + use std::os::unix::prelude::AsRawFd; + use std::path; + + #[test] + #[serial] + fn test_basic() -> Result<()> { + // Note: seccomp profile is really hard to write unit test for. First, + // we can't really test default error or kill action, since rust test + // actually relies on certain syscalls. Second, some of the syscall will + // not return errorno. These syscalls will just send an abort signal or + // even just segfaults. Here we choose to use `getcwd` syscall for + // testing, since it will correctly return an error under seccomp rule. + // This is more of a sanity check. + + // Here, we choose an error that getcwd call would never return on its own, so + // we can make sure that getcwd failed because of seccomp rule. + let expect_error = libc::EAGAIN; + + let seccomp_profile = LinuxSeccomp { + default_action: LinuxSeccompAction::ScmpActAllow, + architectures: Some(vec![Arch::ScmpArchNative]), + flags: None, + syscalls: Some(vec![LinuxSyscall { + names: vec![String::from("getcwd")], + action: LinuxSeccompAction::ScmpActErrno, + errno_ret: Some(expect_error as u32), + args: None, + }]), + }; + + // Since Rust cargo test uses a single process to execute all tests, it + // is a good idea to fork a child process to test the seccomp profile, + // and then kill the process. 
This way, the main test process is + // unaffected. The child process will pass the returned error code + // to the parent for assert and checking. + let (mut sender, mut receiver) = pipe::new()?; + receiver + .set_nonblocking(false) + .with_context(|| "Failed to set channel receiver to blocking")?; + + match unsafe { nix::unistd::fork()? } { + nix::unistd::ForkResult::Parent { child } => { + nix::unistd::close(sender.as_raw_fd())?; + let mut buf = [0; 4]; + receiver + .read_exact(&mut buf) + .context("Failed to wait from child")?; + assert_eq!(i32::from_be_bytes(buf), expect_error); + wait::waitpid(child, None)?; + } + nix::unistd::ForkResult::Child => { + nix::unistd::close(receiver.as_raw_fd())?; + let _ = prctl::set_no_new_privileges(true); + initialize_seccomp(&seccomp_profile)?; + let ret = nix::unistd::getcwd(); + let errno: i32 = if ret.is_err() { + ret.err().unwrap() as i32 + } else { + 0 + }; + sender.write_all(&errno.to_be_bytes())?; + std::process::exit(errno); + } + } + + Ok(()) + } + + #[test] + #[serial] + fn test_moby() -> Result<()> { + let fixture_path = + path::PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("src/seccomp/fixture/config.json"); + let spec = oci_spec::runtime::Spec::load(fixture_path) + .context("Failed to load test spec for seccomp")?; + + // We know linux and seccomp exist, so let's just unwrap. + let seccomp_profile = spec.linux.unwrap().seccomp.unwrap(); + match unsafe { nix::unistd::fork()? 
} { + nix::unistd::ForkResult::Parent { child } => { + let status = wait::waitpid(child, None)?; + match status { + wait::WaitStatus::Exited(_, exit_code) => { + assert_eq!( + exit_code, 0, + "Child process didn't configure seccomp profile correctly" + ); + } + _ => { + bail!("Child process failed to exit correctly: {:?}", status); + } + } + } + nix::unistd::ForkResult::Child => { + let _ = prctl::set_no_new_privileges(true); + let ret = initialize_seccomp(&seccomp_profile); + let exit_code = if ret.is_ok() { 0 } else { -1 }; + std::process::exit(exit_code); + } + } + + Ok(()) + } +}