From 56f2530dad18b3fa0cf1f7e8489f6d7e0f27e1ff Mon Sep 17 00:00:00 2001 From: Anqi Shen Date: Wed, 11 Oct 2023 14:44:29 +0800 Subject: [PATCH] Support plugin network stack This commit adds support for using a third-party network stack as a plugin stack for gVisor. The overall plugin package structure is as follows: - pkg/sentry/socket/plugin: Interfaces for initializing the plugin network stack. They are used for network setup during sandbox creation. - pkg/sentry/socket/plugin/stack: Glue layer between the plugin stack's socket and stack ops and the sentry. It also registers the plugin stack operations when imported. - pkg/sentry/socket/plugin/cgo: Interfaces defined in C that the plugin network stack must implement. To build the runsc-plugin-stack target, which imports the pkg/sentry/socket/plugin/stack package and enables CGO: bazel build --config=plugin-tldk runsc:runsc-plugin-stack (i.e. --config=plugin-tldk selects TLDK as the plugin stack) By using the runsc-plugin-stack binary and setting "--network=plugin" in runtimeArgs, users can use a third-party network stack instead of the netstack embedded in gVisor to get better network performance. Redis benchmark with the following setup: 1. KVM platform 2. 4 physical cores for target pod 3. 
target pod as redis server Runc: $redis-benchmark -h [target ip] -n 100000 -t get,set -q SET: 115207.38 requests per second, p50=0.215 msec GET: 92336.11 requests per second, p50=0.279 msec $redis-benchmark -h [target ip] -n 100000 -t get,set -q SET: 113895.21 requests per second, p50=0.247 msec GET: 96899.23 requests per second, p50=0.271 msec $redis-benchmark -h [target ip] -n 100000 -t get,set -q SET: 126582.27 requests per second, p50=0.199 msec GET: 95969.28 requests per second, p50=0.271 msec Runsc with plugin stack: $redis-benchmark -h [target ip] -n 100000 -t get,set -q SET: 123915.74 requests per second, p50=0.343 msec GET: 115473.45 requests per second, p50=0.335 msec $redis-benchmark -h [target ip] -n 100000 -t get,set -q SET: 120918.98 requests per second, p50=0.351 msec GET: 117647.05 requests per second, p50=0.351 msec $redis-benchmark -h [target ip] -n 100000 -t get,set -q SET: 119904.08 requests per second, p50=0.367 msec GET: 112739.57 requests per second, p50=0.375 msec Runsc with netstack: $redis-benchmark -h [target ip] -n 100000 -t get,set -q SET: 59952.04 requests per second, p50=0.759 msec GET: 61162.08 requests per second, p50=0.631 msec $redis-benchmark -h [target ip] -n 100000 -t get,set -q SET: 52219.32 requests per second, p50=0.719 msec GET: 58719.91 requests per second, p50=0.663 msec $redis-benchmark -h [target ip] -n 100000 -t get,set -q SET: 59952.04 requests per second, p50=0.751 msec GET: 60827.25 requests per second, p50=0.751 msec Updates https://github.com/google/gvisor/issues/9266 Co-developed-by: Tianyu Zhou Signed-off-by: Anqi Shen --- .bazelrc | 3 + WORKSPACE | 7 + pkg/abi/linux/netdevice.go | 3 + pkg/sentry/socket/plugin/BUILD | 18 + pkg/sentry/socket/plugin/cgo/BUILD | 29 + pkg/sentry/socket/plugin/cgo/socket_unsafe.go | 419 +++++++++ pkg/sentry/socket/plugin/cgo/stack_unsafe.go | 82 ++ pkg/sentry/socket/plugin/cgo/util_unsafe.go | 36 + pkg/sentry/socket/plugin/config.go | 36 + pkg/sentry/socket/plugin/plugin.go | 101 +++ 
pkg/sentry/socket/plugin/stack/BUILD | 45 + pkg/sentry/socket/plugin/stack/notifier.go | 165 ++++ pkg/sentry/socket/plugin/stack/provider.go | 105 +++ pkg/sentry/socket/plugin/stack/readwriter.go | 110 +++ pkg/sentry/socket/plugin/stack/socket.go | 854 ++++++++++++++++++ pkg/sentry/socket/plugin/stack/stack.go | 86 ++ pkg/sentry/socket/plugin/stack/util.go | 91 ++ pkg/usermem/BUILD | 1 + pkg/usermem/usermem.go | 46 + runsc/BUILD | 17 + runsc/boot/BUILD | 1 + runsc/boot/controller.go | 9 + runsc/boot/filter/config/BUILD | 2 + runsc/boot/filter/config/config.go | 17 + runsc/boot/filter/config/config_cgo.go | 57 ++ runsc/boot/loader.go | 5 + runsc/boot/network.go | 38 + runsc/cmd/BUILD | 1 + runsc/cmd/gofer.go | 1 + runsc/config/BUILD | 2 + runsc/config/cgo_disabled.go | 20 + runsc/config/cgo_enabled.go | 20 + runsc/config/config.go | 7 + runsc/fsgofer/filter/BUILD | 2 + runsc/fsgofer/filter/config_cgo.go | 48 + runsc/fsgofer/filter/filter.go | 6 + runsc/main_plugin_stack.go | 34 + runsc/sandbox/BUILD | 1 + runsc/sandbox/network.go | 29 + runsc/sandbox/sandbox.go | 4 + tools/plugin-stack/plugin-stack.BUILD | 22 + tools/rules_cgo.patch | 17 + 42 files changed, 2597 insertions(+) create mode 100644 pkg/sentry/socket/plugin/BUILD create mode 100644 pkg/sentry/socket/plugin/cgo/BUILD create mode 100644 pkg/sentry/socket/plugin/cgo/socket_unsafe.go create mode 100644 pkg/sentry/socket/plugin/cgo/stack_unsafe.go create mode 100644 pkg/sentry/socket/plugin/cgo/util_unsafe.go create mode 100644 pkg/sentry/socket/plugin/config.go create mode 100644 pkg/sentry/socket/plugin/plugin.go create mode 100644 pkg/sentry/socket/plugin/stack/BUILD create mode 100644 pkg/sentry/socket/plugin/stack/notifier.go create mode 100644 pkg/sentry/socket/plugin/stack/provider.go create mode 100644 pkg/sentry/socket/plugin/stack/readwriter.go create mode 100644 pkg/sentry/socket/plugin/stack/socket.go create mode 100644 pkg/sentry/socket/plugin/stack/stack.go create mode 100644 
pkg/sentry/socket/plugin/stack/util.go create mode 100644 runsc/boot/filter/config/config_cgo.go create mode 100644 runsc/config/cgo_disabled.go create mode 100644 runsc/config/cgo_enabled.go create mode 100644 runsc/fsgofer/filter/config_cgo.go create mode 100644 runsc/main_plugin_stack.go create mode 100644 tools/plugin-stack/plugin-stack.BUILD create mode 100644 tools/rules_cgo.patch diff --git a/.bazelrc b/.bazelrc index 272ba7e40e..ed620a2db4 100644 --- a/.bazelrc +++ b/.bazelrc @@ -29,6 +29,9 @@ test:race --@io_bazel_rules_go//go/config:race --@io_bazel_rules_go//go/config:p build --@io_bazel_rules_go//go/config:pure test --@io_bazel_rules_go//go/config:pure +# Set bazel_rule as non-pure when cgo is used. +build:plugin-tldk --@io_bazel_rules_go//go/config:pure=false --define=plugin_tldk=true + # By default, exclude nogo targets from building. They will still be included # by default for all tests. build --build_tag_filters=-nogo diff --git a/WORKSPACE b/WORKSPACE index 96b10aac58..df1aad2fd1 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -52,6 +52,7 @@ http_archive( # Allow for patching of the go_sdk. "//tools:rules_go_sdk.patch", "//tools:rules_go_facts.patch", + "//tools:rules_cgo.patch", ], sha256 = "80a98277ad1311dacd837f9b16db62887702e9f1d1c4c9f796d0121a46c8e184", urls = [ @@ -3301,3 +3302,9 @@ go_repository( sum = "h1:uImZAk6qLkC6F9ju6mZ5SPBqTyK8xjZKwSmwnCg4bxg=", version = "v2.3.3", ) + +new_local_repository( + name = "libpluginstack", + path = "tools/plugin-stack", + build_file = "tools/plugin-stack/plugin-stack.BUILD", +) diff --git a/pkg/abi/linux/netdevice.go b/pkg/abi/linux/netdevice.go index 0c30c73f86..924bd88532 100644 --- a/pkg/abi/linux/netdevice.go +++ b/pkg/abi/linux/netdevice.go @@ -85,6 +85,9 @@ type IFConf struct { Ptr uint64 } +// SizeOfIFConf is the binary size of an IFConf struct (16 bytes). 
+var SizeOfIFConf = (*IFConf)(nil).SizeBytes() + // EthtoolCmd is a marshallable type to be able to easily copyin the // the command for an SIOCETHTOOL ioctl. // diff --git a/pkg/sentry/socket/plugin/BUILD b/pkg/sentry/socket/plugin/BUILD new file mode 100644 index 0000000000..22cb6445d5 --- /dev/null +++ b/pkg/sentry/socket/plugin/BUILD @@ -0,0 +1,18 @@ +package(licenses = ["notice"]) + +load("//tools:defs.bzl", "go_library") + +go_library( + name = "plugin", + srcs = [ + "config.go", + "plugin.go", + ], + visibility = ["//visibility:public"], + deps = [ + "//pkg/seccomp", + "//pkg/sentry/inet", + "//pkg/waiter", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/pkg/sentry/socket/plugin/cgo/BUILD b/pkg/sentry/socket/plugin/cgo/BUILD new file mode 100644 index 0000000000..0589921a07 --- /dev/null +++ b/pkg/sentry/socket/plugin/cgo/BUILD @@ -0,0 +1,29 @@ +package(licenses = ["notice"]) + +load("//tools:defs.bzl", "go_library") + +go_library( + name = "cgo", + srcs = [ + "socket_unsafe.go", + "stack_unsafe.go", + "util_unsafe.go", + ], + cgo = True, + visibility = ["//visibility:public"], + deps = [ + "//pkg/abi/linux", + "//pkg/abi/linux/errno", + "//runsc/config", + ], + cdeps = [ + "@libpluginstack//:libpluginstack", + ], + copts = [ + "-march=native", + "-I external/libpluginstack/lib/libtle_glue", + ], + clinkopts = [ + "-L external/libpluginstack", + ], +) diff --git a/pkg/sentry/socket/plugin/cgo/socket_unsafe.go b/pkg/sentry/socket/plugin/cgo/socket_unsafe.go new file mode 100644 index 0000000000..1f0a1972df --- /dev/null +++ b/pkg/sentry/socket/plugin/cgo/socket_unsafe.go @@ -0,0 +1,419 @@ +// Copyright 2023 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cgo + +/* +#include +#include +#include + +// socket event-related operations +int plugin_epoll_create(void); +int plugin_epoll_ctl(int epfd, int op, int fd, struct epoll_event *event); +int plugin_epoll_wait(int epfd, struct epoll_event *events, int maxevents, int timeout); + +// socket control-path operations +int plugin_socket(int domain, int type, int protocol, uint64_t *err); +int plugin_listen(int sockfd, int backlog, uint64_t *err); +int plugin_bind(int sockfd, const struct sockaddr *addr, socklen_t addrlen, uint64_t *err); +int plugin_accept(int sockfd, struct sockaddr *addr, socklen_t *addrlen, uint64_t *err); +int plugin_connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen, uint64_t *err); +int plugin_getsockopt(int sockfd, int level, int optname, + void *optval, socklen_t *optlen, uint64_t *err); +int plugin_setsockopt(int sockfd, int level, int optname, + const void *optval, socklen_t optlen, uint64_t *err); +int plugin_getsockname(int sockfd, struct sockaddr *addr, socklen_t *addrlen, uint64_t *err); +int plugin_getpeername(int sockfd, struct sockaddr *addr, socklen_t *addrlen, uint64_t *err); +int plugin_ioctl(int fd, uint64_t *err, unsigned long int request, void *buf); +int plugin_shutdown(int sockfd, int how, uint64_t *err); +int plugin_close(int fd); +int plugin_readiness(int fd, int events); + +// socket data-path (ingress) operations +ssize_t plugin_recv(int sockfd, void *buf, size_t len, int flags, uint64_t *err); +ssize_t plugin_recvfrom(int sockfd, void *buf, size_t len, int flags, + struct sockaddr 
*src_addr, socklen_t *addrlen, uint64_t *err); +ssize_t plugin_recvmsg(int sockfd, struct msghdr *msg, int flags, uint64_t *err); +ssize_t plugin_read(int fd, void *buf, size_t count, uint64_t *err); +ssize_t plugin_readv(int fd, const struct iovec *iov, int iovcnt, uint64_t *err); + +// socket data-path (egress) operations +ssize_t plugin_send(int sockfd, const void *buf, size_t len, int flags, uint64_t *err); +ssize_t plugin_sendto(int sockfd, const void *buf, size_t len, int flags, + const struct sockaddr *dest_addr, socklen_t addrlen, uint64_t *err); +ssize_t plugin_sendmsg(int sockfd, const struct msghdr *msg, int flags, uint64_t *err); +ssize_t plugin_write(int fd, const void *buf, size_t count, uint64_t *err); +ssize_t plugin_writev(int fd, const struct iovec *iov, int iovcnt, uint64_t *err); +*/ +import "C" +import ( + "syscall" + "unsafe" + + "gvisor.dev/gvisor/pkg/abi/linux" + linuxerrno "gvisor.dev/gvisor/pkg/abi/linux/errno" +) + +// EpollCreate works as a CGO wrapper for plugin_epoll_create. +func EpollCreate() int { + return int(C.plugin_epoll_create()) +} + +// EpollCtl works as a CGO wrapper for plugin_epoll_ctl. +func EpollCtl(epfd int32, op int, handle, events uint32) { + epollEvent := syscall.EpollEvent{ + Events: events, + Fd: int32(handle), + } + C.plugin_epoll_ctl( + C.int(epfd), + C.int(op), + C.int(handle), + (*C.struct_epoll_event)(unsafe.Pointer(&epollEvent))) +} + +// EpollWait works as a CGO wrapper for plugin_epoll_wait. +func EpollWait(epfd int32, events []syscall.EpollEvent, n int, us int) int { + if len(events) == 0 { + return 0 + } + return int(C.plugin_epoll_wait( + C.int(epfd), + (*C.struct_epoll_event)(unsafe.Pointer(&events[0])), + C.int(n), + C.int(us))) +} + +// Socket works as a CGO wrapper for plugin_socket. +// Note: This function will set socket as non-blocking. 
+func Socket(domain, skType, protocol int) int64 { + var errno uint64 + if fd := int64(C.plugin_socket( + C.int(domain), + C.int(skType), + C.int(protocol), + (*C.uint64_t)(unsafe.Pointer(&errno)))); fd < 0 { + return -int64(errno) + } else { + nonblock := 1 + C.plugin_ioctl( + C.int(fd), + (*C.uint64_t)(unsafe.Pointer(&errno)), + C.uint64_t(linux.FIONBIO), + unsafe.Pointer(&nonblock)) + return fd + } +} + +// Bind works as a CGO wrapper for plugin_bind. +func Bind(handle uint32, sa []byte) int64 { + var errno uint64 + return convertRetVal( + int64(C.plugin_bind( + C.int(handle), + (*C.struct_sockaddr)(GetPtr(sa)), + C.uint(len(sa)), + (*C.uint64_t)(unsafe.Pointer(&errno)))), + errno) +} + +// Listen works as a CGO wrapper for plugin_listen. +func Listen(handle uint32, backlog int) int64 { + var errno uint64 + return convertRetVal( + int64(C.plugin_listen( + C.int(handle), + C.int(backlog), + (*C.uint64_t)(unsafe.Pointer(&errno)))), + errno) +} + +// Accept works as a CGO wrapper for plugin_accept. +// Note: This function will set socket as non-blocking. +func Accept(handle uint32, addrPtr *byte, lenPtr *uint32) int64 { + var errno uint64 + if fd := int64(C.plugin_accept( + C.int(handle), + (*C.struct_sockaddr)(unsafe.Pointer(addrPtr)), + (*C.socklen_t)(unsafe.Pointer(lenPtr)), + (*C.uint64_t)(unsafe.Pointer(&errno)))); fd < 0 { + return -int64(errno) + } else { + nonblock := 1 + C.plugin_ioctl( + C.int(fd), + (*C.uint64_t)(unsafe.Pointer(&errno)), + C.uint64_t(linux.FIONBIO), + unsafe.Pointer(&nonblock)) + return fd + } +} + +// Ioctl works as a CGO wrapper for plugin_ioctl. +func Ioctl(handle uint32, cmd uint32, buf []byte) int64 { + var errno uint64 + return convertRetVal( + int64(C.plugin_ioctl( + C.int(handle), + (*C.uint64_t)(unsafe.Pointer(&errno)), + C.uint64_t(cmd), + GetPtr(buf))), + errno) +} + +// Connect works as a CGO wrapper for plugin_connect. 
+func Connect(handle uint32, addr []byte) int64 { + var errno uint64 + return convertRetVal( + int64(C.plugin_connect( + C.int(handle), + (*C.struct_sockaddr)(GetPtr(addr)), + C.socklen_t(len(addr)), + (*C.uint64_t)(unsafe.Pointer(&errno)))), + errno) +} + +// Getsockopt works as a CGO wrapper for plugin_getsockopt. +// s is passed to the plugin as the initial *optlen; the second return value +// is *optlen after the call. +func Getsockopt(handle uint32, l int, n int, val []byte, s int) (int64, int) { + var errno uint64 + // The plugin writes a socklen_t through the optlen pointer. Give it a real + // C.socklen_t instead of reinterpreting a Go int, whose size differs and + // which would only yield the right value on little-endian hosts. + optlen := C.socklen_t(s) + ret := int64(C.plugin_getsockopt( + C.int(handle), + C.int(l), + C.int(n), + GetPtr(val), + &optlen, + (*C.uint64_t)(unsafe.Pointer(&errno)))) + return convertRetVal(ret, errno), int(optlen) +} + +// Setsockopt works as a CGO wrapper for plugin_setsockopt. +func Setsockopt(handle uint32, l int, n int, val []byte) int64 { + var errno uint64 + return convertRetVal( + int64(C.plugin_setsockopt( + C.int(handle), + C.int(l), + C.int(n), + GetPtr(val), + C.socklen_t(len(val)), + (*C.uint64_t)(unsafe.Pointer(&errno)))), + errno) +} + +// Shutdown works as a CGO wrapper for plugin_shutdown. +func Shutdown(handle uint32, how int) int64 { + var errno uint64 + return convertRetVal( + int64(C.plugin_shutdown( + C.int(handle), + C.int(how), + (*C.uint64_t)(unsafe.Pointer(&errno)))), + errno) +} + +// Close works as a CGO wrapper for plugin_close. +func Close(handle uint32) { + C.plugin_close(C.int(handle)) +} + +// Getsockname works as a CGO wrapper for plugin_getsockname. +func Getsockname(handle uint32, addr []byte, addrlen *uint32) int64 { + var errno uint64 + if len(addr) == 0 { + return -linuxerrno.EINVAL + } + return convertRetVal( + int64(C.plugin_getsockname( + C.int(handle), + (*C.struct_sockaddr)(unsafe.Pointer(&addr[0])), + (*C.socklen_t)(unsafe.Pointer(addrlen)), + (*C.uint64_t)(unsafe.Pointer(&errno)))), + errno) +} + +// GetPeername works as a CGO wrapper for plugin_getpeername. 
+func GetPeername(handle uint32, addr []byte, addrlen *uint32) int64 { + var errno uint64 + if len(addr) == 0 { + return -linuxerrno.EINVAL + } + return convertRetVal( + int64(C.plugin_getpeername( + C.int(handle), + (*C.struct_sockaddr)(unsafe.Pointer(&addr[0])), + (*C.socklen_t)(unsafe.Pointer(addrlen)), + (*C.uint64_t)(unsafe.Pointer(&errno)))), + errno) +} + +// Readiness works as a CGO wrapper for plugin_readiness. +func Readiness(handle uint32, mask uint64) int64 { + return int64(C.plugin_readiness(C.int(handle), C.int(mask))) +} + +// Read works as a CGO wrapper for plugin_read. +func Read(handle uint32, buf uintptr, count int) int64 { + var errno uint64 + return convertRetVal( + int64(C.plugin_read( + C.int(handle), + unsafe.Pointer(buf), + C.size_t(count), + (*C.uint64_t)(unsafe.Pointer(&errno)))), + errno) +} + +// Readv works as a CGO wrapper for plugin_readv. +func Readv(handle uint32, iovs []syscall.Iovec) int64 { + var errno uint64 + if len(iovs) == 0 { + return 0 + } + return convertRetVal( + int64(C.plugin_readv( + C.int(handle), + (*C.struct_iovec)(unsafe.Pointer(&iovs[0])), + C.int(len(iovs)), + (*C.uint64_t)(unsafe.Pointer(&errno)))), + errno) +} + +// Recvfrom works as a CGO wrapper for plugin_recvfrom. +// The second return value is the address length reported back by the plugin +// through *addrlen. +func Recvfrom(handle uint32, buf, addr []byte, flags int) (int64, int) { + var errno uint64 + // The plugin writes a socklen_t through the addrlen pointer. Give it a + // real C.socklen_t instead of reinterpreting a Go int, whose size differs + // and which would only yield the right value on little-endian hosts. + addrlen := C.socklen_t(len(addr)) + ret := int64(C.plugin_recvfrom( + C.int(handle), + GetPtr(buf), + C.size_t(len(buf)), + C.int(flags), + (*C.struct_sockaddr)(GetPtr(addr)), + &addrlen, + (*C.uint64_t)(unsafe.Pointer(&errno)))) + return convertRetVal(ret, errno), int(addrlen) +} + +// Recvmsg works as a CGO wrapper for plugin_recvmsg. 
+func Recvmsg(handle uint32, iovs []syscall.Iovec, addr, control []byte, flags int) (int64, int, int, int) { + lenAddr := len(addr) + lenCtl := len(control) + sysflags := flags | syscall.MSG_DONTWAIT + + if len(iovs) == 0 { + return 0, lenAddr, lenCtl, 0 + } + + var ptrAddr, ptrCtl *byte + if lenAddr > 0 { + ptrAddr = &addr[0] + } + + if lenCtl > 0 { + ptrCtl = &control[0] + } + + msg := syscall.Msghdr{ + Iov: &iovs[0], + Iovlen: uint64(len(iovs)), + Name: ptrAddr, + Namelen: uint32(lenAddr), + Control: ptrCtl, + Controllen: uint64(lenCtl), + } + + var errno uint64 + if ret := int64(C.plugin_recvmsg( + C.int(handle), + (*C.struct_msghdr)(unsafe.Pointer(&msg)), + C.int(sysflags), + (*C.uint64_t)(unsafe.Pointer(&errno)))); ret < 0 { + return -int64(errno), lenAddr, lenCtl, 0 + } else { + return ret, int(msg.Namelen), int(msg.Controllen), int(msg.Flags) + } +} + +// Write works as a CGO wrapper for plugin_write. +func Write(handle uint32, buf uintptr, count int) int64 { + var errno uint64 + return convertRetVal( + int64(C.plugin_write( + C.int(handle), + unsafe.Pointer(buf), + C.size_t(count), + (*C.uint64_t)(unsafe.Pointer(&errno)))), + errno) +} + +// Writev works as a CGO wrapper for plugin_writev. +func Writev(handle uint32, iovs []syscall.Iovec) int64 { + var errno uint64 + if len(iovs) == 0 { + return 0 + } + return convertRetVal( + int64(C.plugin_writev( + C.int(handle), + (*C.struct_iovec)(unsafe.Pointer(&iovs[0])), + C.int(len(iovs)), + (*C.uint64_t)(unsafe.Pointer(&errno)))), + errno) +} + +// Sendto works as a CGO wrapper for plugin_sendto. +func Sendto(handle uint32, buf uintptr, count int, flags int, addr []byte) int64 { + var errno uint64 + return convertRetVal( + int64(C.plugin_sendto( + C.int(handle), + unsafe.Pointer(buf), + C.size_t(count), + C.int(flags), + (*C.struct_sockaddr)(GetPtr(addr)), + C.socklen_t(len(addr)), + (*C.uint64_t)(unsafe.Pointer(&errno)))), + errno) +} + +// Sendmsg works as a CGO wrapper for plugin_sendmsg. 
+func Sendmsg(handle uint32, iovs []syscall.Iovec, addr []byte, flags int) int64 { + var errno uint64 + if len(iovs) == 0 { + return 0 + } + if len(addr) == 0 { + return -linuxerrno.EINVAL + } + + msg := syscall.Msghdr{ + Iov: &iovs[0], + Iovlen: uint64(len(iovs)), + Name: &addr[0], + Namelen: uint32(len(addr)), + } + return convertRetVal( + int64(C.plugin_sendmsg( + C.int(handle), + (*C.struct_msghdr)(unsafe.Pointer(&msg)), + C.int(flags), + (*C.uint64_t)(unsafe.Pointer(&errno)))), + errno) +} diff --git a/pkg/sentry/socket/plugin/cgo/stack_unsafe.go b/pkg/sentry/socket/plugin/cgo/stack_unsafe.go new file mode 100644 index 0000000000..169a7bea1f --- /dev/null +++ b/pkg/sentry/socket/plugin/cgo/stack_unsafe.go @@ -0,0 +1,82 @@ +// Copyright 2023 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package cgo provides interfaces definition to interact with third-party +// network stack. It also implements CGO wrappers to handle Golang arguments +// to CGO and CGO return values to Golang. +// +// Third-party external network stack will implement interfaces defined in this +// package in order to be used by gVisor. +package cgo + +/* +#include + +// stack initialization operations +int plugin_initstack(char *init_str, int *fds, int num); +int plugin_preinitstack(int pid, char **init_str_ptr, int **fds, int *num); +*/ +import "C" + +import ( + "fmt" + "unsafe" +) + +// InitStack implements CGO wrapper for plugin_initstack. 
+func InitStack(initStr string, fds []int) error { + cs := C.CString(initStr) + defer C.free(unsafe.Pointer(cs)) + + // Convert the FDs into a C int array. + cfds := make([]C.int, len(fds)) + for i, fd := range fds { + cfds[i] = C.int(fd) + } + + // &cfds[0] panics on an empty slice, so hand the plugin a NULL array + // (with num == 0) when no FDs were supplied. + var cfdPtr *C.int + if len(cfds) > 0 { + cfdPtr = &cfds[0] + } + + if ret := C.plugin_initstack(cs, cfdPtr, C.int(len(cfds))); ret != 0 { + return fmt.Errorf("failed to init stack, ret = %v", ret) + } + + return nil +} + +// PreInitStack implements CGO wrapper for plugin_preinitstack. +func PreInitStack(pid int) (string, []int, error) { + var ( + cInitStr *C.char + cFdArray *C.int + num C.int + ) + + if ret := C.plugin_preinitstack( + C.int(pid), + (**C.char)(unsafe.Pointer(&cInitStr)), + (**C.int)(unsafe.Pointer(&cFdArray)), + (*C.int)(unsafe.Pointer(&num))); ret != 0 { + return "", nil, fmt.Errorf("failed to prepare init args for the stack, ret = %v", ret) + } + + defer func() { + C.free(unsafe.Pointer(cInitStr)) + C.free(unsafe.Pointer(cFdArray)) + }() + + initStr := C.GoString(cInitStr) + fds := make([]int, int(num)) + cFds := unsafe.Slice(cFdArray, num) + for i := 0; i < int(num); i++ { + fds[i] = int(cFds[i]) + } + return initStr, fds, nil +} diff --git a/pkg/sentry/socket/plugin/cgo/util_unsafe.go b/pkg/sentry/socket/plugin/cgo/util_unsafe.go new file mode 100644 index 0000000000..c53ce96829 --- /dev/null +++ b/pkg/sentry/socket/plugin/cgo/util_unsafe.go @@ -0,0 +1,36 @@ +// Copyright 2023 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package cgo + +import ( + "unsafe" +) + +// GetPtr gets []byte's start address and converts the address into +// unsafe.Pointer that will be used as C pointer. +func GetPtr(bs []byte) unsafe.Pointer { + if len(bs) == 0 { + return nil + } + return unsafe.Pointer(&bs[0]) +} + +func convertRetVal(ret int64, errno uint64) int64 { + if ret < 0 { + return -int64(errno) + } else { + return ret + } +} diff --git a/pkg/sentry/socket/plugin/config.go b/pkg/sentry/socket/plugin/config.go new file mode 100644 index 0000000000..8c981f6686 --- /dev/null +++ b/pkg/sentry/socket/plugin/config.go @@ -0,0 +1,36 @@ +// Copyright 2023 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package plugin + +import ( + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/seccomp" +) + +// PluginFilters defines seccomp allowed rules that are needed by cgo. +func PluginFilters() seccomp.SyscallRules { + return seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ + unix.SYS_MMAP: seccomp.PerArg{ + // allow alloc_seg in DPDK + seccomp.AnyValue{}, + seccomp.AnyValue{}, + seccomp.AnyValue{}, + seccomp.EqualTo( + unix.MAP_SHARED | + unix.MAP_ANONYMOUS | + unix.MAP_FIXED), + }, + }) +} diff --git a/pkg/sentry/socket/plugin/plugin.go b/pkg/sentry/socket/plugin/plugin.go new file mode 100644 index 0000000000..e8da54ec82 --- /dev/null +++ b/pkg/sentry/socket/plugin/plugin.go @@ -0,0 +1,101 @@ +// Copyright 2023 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package plugin provides a set of interfaces to interact with +// third-party netstack. It will be used during sandbox network setup when +// NetworkType is set as NetworkPlugin. +package plugin + +import ( + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/waiter" +) + +// PluginStack defines a set of stack operations to work with a third-party +// plugin stack. +type PluginStack interface { + inet.Stack + + // Init initializes plugin stack. + Init(args *InitStackArgs) error + + // PreInit handles prepare steps before initializing plugin stack. + // It may include joining namespace, mounting NIC, etc. + PreInit(args *PreInitStackArgs) (string, []int, error) +} + +// InitStackArgs is a struct that holds arguments needed by PluginStack.Init. +type InitStackArgs struct { + // InitStr represents arguments needed to initialize plugin stack. + InitStr string + + // FDs represents files opened during stack pre-init stage, which will + // be used in stack initialization. + FDs []int +} + +// PreInitStackArgs is a struct that holds arguments needed by +// PluginStack.PreInit. +type PreInitStackArgs struct { + // Pid represents current process that invokes plugin stack + // pre-init. + Pid int +} + +var pluginStack PluginStack + +// RegisterPluginStack registers given stack as plugin stack. 
+func RegisterPluginStack(stack PluginStack) { + if pluginStack != nil { + panic("called RegisterPluginStack more than once") + } + pluginStack = stack +} + +// GetPluginStack fetches the currently registered plugin stack. +func GetPluginStack() PluginStack { + return pluginStack +} + +// EventInfo is a struct that holds the information needed by a socket's +// notification mechanism. +type EventInfo struct { + // Wq is the event queue corresponding to the socket. + Wq *waiter.Queue + + // Mask represents the events this socket has registered for. + Mask waiter.EventMask + + // Ready represents the events that have currently been reported. + Ready waiter.EventMask + + // Waiting represents whether there is any waiting event. + Waiting bool +} + +// PluginNotifier represents a set of operations to handle the +// plugin network stack's event notification mechanisms. +type PluginNotifier interface { + // AddFD registers a new socket fd and its corresponding + // event notification info into the global fdMap. + AddFD(fd uint32, eventinfo *EventInfo) error + + // RemoveFD unregisters a socket fd and its corresponding + // event notification info from the global fdMap. + RemoveFD(fd uint32) + + // UpdateFD updates the set of events the socket fd needs + // to be notified on. 
+ UpdateFD(fd uint32) error +} diff --git a/pkg/sentry/socket/plugin/stack/BUILD b/pkg/sentry/socket/plugin/stack/BUILD new file mode 100644 index 0000000000..3a3842dafa --- /dev/null +++ b/pkg/sentry/socket/plugin/stack/BUILD @@ -0,0 +1,45 @@ +package(licenses = ["notice"]) + +load("//tools:defs.bzl", "go_library") + +go_library( + name = "stack", + srcs = [ + "notifier.go", + "provider.go", + "readwriter.go", + "socket.go", + "stack.go", + "util.go", + ], + visibility = ["//visibility:public"], + deps = [ + "//pkg/abi/linux", + "//pkg/abi/linux/errno", + "//pkg/binary", + "//pkg/context", + "//pkg/errors/linuxerr", + "//pkg/hostarch", + "//pkg/marshal", + "//pkg/marshal/primitive", + "//pkg/safemem", + "//pkg/sentry/arch", + "//pkg/sentry/fsimpl/sockfs", + "//pkg/sentry/inet", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/sentry/socket", + "//pkg/sentry/socket/plugin", + "//pkg/sentry/socket/plugin/cgo", + "//pkg/sentry/unimpl", + "//pkg/sentry/vfs", + "//pkg/syserr", + "//pkg/tcpip", + "//pkg/tcpip/network/ipv4", + "//pkg/tcpip/network/ipv6", + "//pkg/usermem", + "//pkg/waiter", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/pkg/sentry/socket/plugin/stack/notifier.go b/pkg/sentry/socket/plugin/stack/notifier.go new file mode 100644 index 0000000000..3d7f55f555 --- /dev/null +++ b/pkg/sentry/socket/plugin/stack/notifier.go @@ -0,0 +1,165 @@ +// Copyright 2023 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package stack + +import ( + "fmt" + "runtime" + "sync" + "syscall" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/sentry/socket/plugin" + "gvisor.dev/gvisor/pkg/sentry/socket/plugin/cgo" + "gvisor.dev/gvisor/pkg/waiter" +) + +// Notifier holds all the state necessary to issue notifications when +// IO events occur on the observed FDs in plugin stack. +type Notifier struct { + // the epoll FD used to register for io notifications. + epFD int32 + + // mu protects eventMap. + mu sync.Mutex + + // eventMap maps file descriptors to their notification queues + // and waiting status. + eventMap map[uint32]*plugin.EventInfo +} + +const ( + MaxEpollEvents = 128 + SleepInMsecond = 100 +) + +// NewNotifier initialize the event notifier for plugin stack. +// It will allocate a eventMap with fd as key and corresponding eventInfo +// as value and start a goroutine waiting the arrival of events. +func NewNotifier() *Notifier { + ioInit := make(chan int32) + + n := &Notifier{ + eventMap: make(map[uint32]*plugin.EventInfo), + } + + go n.waitAndNotify(ioInit) + + epFD := <-ioInit + if epFD < 0 { + return nil + } + return n +} + +// AddFD implements plugin.PluginNotifier.AddFD. +func (n *Notifier) AddFD(fd uint32, eventInfo *plugin.EventInfo) { + n.mu.Lock() + defer n.mu.Unlock() + + // Panic if we're already notifying on this FD. + if _, ok := n.eventMap[fd]; ok { + panic(fmt.Sprintf("File descriptor %d added twice", fd)) + } + + // We have nothing to wait for at the moment. Just add it to the map. + n.eventMap[fd] = eventInfo +} + +// RemoveFD implements plugin.PluginNotifier.RemoveFD. +func (n *Notifier) RemoveFD(fd uint32) { + n.mu.Lock() + defer n.mu.Unlock() + delete(n.eventMap, fd) +} + +// UpdateFD implements plugin.PluginNotifier.UpdateFD. 
+func (n *Notifier) UpdateFD(fd uint32) { + n.mu.Lock() + defer n.mu.Unlock() + + if eventInfo, ok := n.eventMap[fd]; ok { + n.waitFD(fd, eventInfo) + } +} + +// waitAndNotify loops waiting for io event notifications from the epoll +// object. Once notifications arrive, they are dispatched to the +// registered queue. +func (n *Notifier) waitAndNotify(ioInit chan int32) error { + // plugin stack leverages TLS varaibles, so bind this goroutine with + // one specific OS thread + runtime.LockOSThread() + + // If current thread is not the main thread, change the thread name. + if syscall.Getpid() != syscall.Gettid() { + threadName := []byte("io-thread\x00") + if err := unix.Prctl(unix.PR_SET_NAME, uintptr(cgo.GetPtr(threadName)), 0, 0, 0); err != nil { + return err + } + } + + n.epFD = int32(cgo.EpollCreate()) + + ioInit <- n.epFD + + var events [MaxEpollEvents]syscall.EpollEvent + for { + num := cgo.EpollWait(n.epFD, events[:], MaxEpollEvents, SleepInMsecond) + if num <= 0 { + continue + } + + n.mu.Lock() + for i := 0; i < num; i++ { + h := uint32(events[i].Fd) + eventInfo, ok := n.eventMap[h] + if !ok { + continue + } + + ev := waiter.EventMask(events[i].Events) + eventInfo.Ready |= ev & (eventInfo.Mask | waiter.EventErr | waiter.EventHUp) + // When an error occurred, invoke all events + if ev&(waiter.EventErr|waiter.EventHUp) != 0 { + ev |= waiter.EventIn | waiter.EventOut + } + eventInfo.Wq.Notify(ev) + } + n.mu.Unlock() + } +} + +func (n *Notifier) waitFD(fd uint32, eventInfo *plugin.EventInfo) { + mask := eventInfo.Wq.Events() + + eventInfo.Mask = mask + if !eventInfo.Waiting && mask == 0 { + return + } + + switch { + case !eventInfo.Waiting && mask != 0: + cgo.EpollCtl(n.epFD, syscall.EPOLL_CTL_ADD, fd, uint32(mask)) + eventInfo.Waiting = true + case eventInfo.Waiting && mask == 0: + cgo.EpollCtl(n.epFD, syscall.EPOLL_CTL_DEL, fd, uint32(mask)) + eventInfo.Ready = 0 + eventInfo.Waiting = false + case eventInfo.Waiting && mask != 0: + cgo.EpollCtl(n.epFD, 
syscall.EPOLL_CTL_MOD, fd, uint32(mask)) + eventInfo.Ready &= mask + } +} diff --git a/pkg/sentry/socket/plugin/stack/provider.go b/pkg/sentry/socket/plugin/stack/provider.go new file mode 100644 index 0000000000..4e46560635 --- /dev/null +++ b/pkg/sentry/socket/plugin/stack/provider.go @@ -0,0 +1,105 @@ +// Copyright 2023 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package stack + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/sentry/socket/plugin/cgo" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" +) + +type provider struct { + family int + netProto tcpip.NetworkProtocolNumber +} + +// Socket creates a new socket object for the AF_INET or AF_INET6 family. +func (p *provider) Socket(t *kernel.Task, skType linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) { + // Fail right away if there is no plugin stack registered. + ctx := t.NetworkContext() + if ctx == nil { + return nil, nil + } + _, ok := ctx.(*Stack) + if !ok { + return nil, nil + } + + // Only accept TCP and UDP. 
+ stype := skType & linux.SOCK_TYPE_MASK + switch stype { + case syscall.SOCK_STREAM: + switch protocol { + case 0, syscall.IPPROTO_TCP: + default: + return nil, syserr.ErrProtocolNotSupported + } + case syscall.SOCK_DGRAM: + switch protocol { + case 0, syscall.IPPROTO_UDP: + default: + return nil, syserr.ErrProtocolNotSupported + } + case syscall.SOCK_RAW: + // Raw sockets require CAP_NET_RAW. + creds := auth.CredentialsFromContext(t) + if !creds.HasCapability(linux.CAP_NET_RAW) { + return nil, syserr.ErrPermissionDenied + } + default: + return nil, syserr.ErrSocketNotSupported + } + + handle := cgo.Socket(p.family, int(skType), protocol) + if handle < 0 { + return nil, int2err(handle) + } + + fd, err := newSocket(t, p.family, skType, protocol, stack.notifier, int(handle), uint32(skType&syscall.SOCK_NONBLOCK)) + return fd, err +} + +// Pair just returns nil sockets (not supported). +func (*provider) Pair(*kernel.Task, linux.SockType, int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) { + return nil, nil, nil +} + +func init() { + // Providers backed by plugin stack. + p := []provider{ + { + family: linux.AF_INET, + netProto: ipv4.ProtocolNumber, + }, + + { + family: linux.AF_INET6, + netProto: ipv6.ProtocolNumber, + }, + } + + for i := range p { + socket.RegisterProvider(p[i].family, &p[i]) + } +} diff --git a/pkg/sentry/socket/plugin/stack/readwriter.go b/pkg/sentry/socket/plugin/stack/readwriter.go new file mode 100644 index 0000000000..db9455a087 --- /dev/null +++ b/pkg/sentry/socket/plugin/stack/readwriter.go @@ -0,0 +1,110 @@ +// Copyright 2023 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package stack + +import ( + "sync" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/socket/plugin/cgo" +) + +type pluginStackRW struct { + handle uint32 + + // Represent both input and output flags. + flags uint32 + + // Reused as msg_control for read. + to []byte + + iovs [3]syscall.Iovec +} + +var pluginStackRWPool = sync.Pool{ + New: func() interface{} { + return &pluginStackRW{} + }, +} + +func getReadWriter(handle uint32) *pluginStackRW { + rw := pluginStackRWPool.Get().(*pluginStackRW) + rw.handle = handle + return rw +} + +func putReadWriter(rw *pluginStackRW) { + *rw = pluginStackRW{} + pluginStackRWPool.Put(rw) +} + +// ReadToBlocks implements safemem.Reader.ReadToBlocks. +func (rw *pluginStackRW) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { + // Set MSG_DONTWAIT flag to avoid blocking in plugin stack. + flags := int(rw.flags) & ^linux.MSG_DONTWAIT + if len(rw.to) != 0 || flags != 0 { + iovs := iovecsFromBlockSeq(dsts, rw) + rc, _, lc, mflags := cgo.Recvmsg(rw.handle, iovs, nil, rw.to, int(rw.flags)) + if rc >= 0 { + rw.to = rw.to[:lc] + rw.flags = uint32(mflags) + } + return translateReturn(rc) + } + + var rc int64 + if dsts.IsEmpty() { + rc = 0 + } else if dsts.NumBlocks() == 1 { + rc = cgo.Read(rw.handle, dsts.Head().Addr(), dsts.Head().Len()) + } else { + rc = cgo.Readv(rw.handle, iovecsFromBlockSeq(dsts, rw)) + } + + return translateReturn(rc) +} + +// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. 
+// +// Preconditions: rw.d.metadataMu must be locked. +func (rw *pluginStackRW) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { + var rc int64 + + if rw.to != nil { + if srcs.IsEmpty() { + // Invoke plugin stack checking whether there is any error to report + // on target socket which sends 0-length data. + rc = cgo.Sendto(rw.handle, 0, 0, 0, rw.to) + } else if srcs.NumBlocks() == 1 { + rc = cgo.Sendto(rw.handle, srcs.Head().Addr(), srcs.Head().Len(), 0, rw.to) + } else { + iovs := iovecsFromBlockSeq(srcs, rw) + rc = cgo.Sendmsg(rw.handle, iovs, rw.to, 0) + } + } else { + if srcs.IsEmpty() { + // Invoke plugin stack checking whether there is any error to report + // on target socket which sends 0-length data. + rc = cgo.Write(rw.handle, 0, 0) + } else if srcs.NumBlocks() == 1 { + rc = cgo.Write(rw.handle, srcs.Head().Addr(), srcs.Head().Len()) + } else { + rc = cgo.Writev(rw.handle, iovecsFromBlockSeq(srcs, rw)) + } + } + return translateReturn(rc) +} diff --git a/pkg/sentry/socket/plugin/stack/socket.go b/pkg/sentry/socket/plugin/stack/socket.go new file mode 100644 index 0000000000..1e8ca2446e --- /dev/null +++ b/pkg/sentry/socket/plugin/stack/socket.go @@ -0,0 +1,854 @@ +// Copyright 2023 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package stack + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/abi/linux/errno" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/hostarch" + "gvisor.dev/gvisor/pkg/marshal" + "gvisor.dev/gvisor/pkg/marshal/primitive" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/kernel" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/sentry/socket/plugin" + "gvisor.dev/gvisor/pkg/sentry/socket/plugin/cgo" + "gvisor.dev/gvisor/pkg/sentry/unimpl" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +type socketOperations struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + vfs.LockFD + socket.SendReceiveTimeout + + family int + skType linux.SockType + protocol int + + // fd holds the current socket fd created by plugin stack. + fd uint32 `state:"nosave"` + + // eventInfo holds current socket fd's corresponding notification queue + // and events status. It will be used to interact with plugin + // network stack for event reporting. + eventInfo plugin.EventInfo +} + +var _ = socket.Socket(&socketOperations{}) + +const ( + sizeofSockaddr = syscall.SizeofSockaddrInet6 + + // Lo IF index. 
+ ifLoIndex = 1 +) + +func newSocket(t *kernel.Task, family int, skType linux.SockType, protocol int, notifier *Notifier, fd int, flags uint32) (*vfs.FileDescription, *syserr.Error) { + mnt := t.Kernel().SocketMount() + d := sockfs.NewDentry(t, mnt) + defer d.DecRef(t) + + switch skType { + case syscall.SOCK_STREAM: + protocol = syscall.IPPROTO_TCP + case syscall.SOCK_DGRAM: + protocol = syscall.IPPROTO_UDP + default: + return nil, syserr.ErrSocketNotSupported + } + + wq := &waiter.Queue{} + sop := &socketOperations{ + family: family, + fd: uint32(fd), + protocol: protocol, + skType: skType, + eventInfo: plugin.EventInfo{Wq: wq}, + } + + sop.LockFD.Init(&vfs.FileLocks{}) + + vfsfd := &sop.vfsfd + if err := vfsfd.Init(sop, linux.O_RDWR|(flags&linux.O_NONBLOCK), mnt, d, &vfs.FileDescriptionOptions{ + DenyPRead: true, + DenyPWrite: true, + UseDentryMetadata: true, + }); err != nil { + return nil, syserr.FromError(err) + } + + notifier.AddFD(uint32(fd), &sop.eventInfo) + + return vfsfd, nil +} + +// Bind implements socket.Socket.Bind. +func (s *socketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { + return int2err(cgo.Bind(s.fd, sockaddr)) +} + +// Listen implements socket.Socket.Listen. +func (s *socketOperations) Listen(t *kernel.Task, backlog int) *syserr.Error { + return int2err(cgo.Listen(s.fd, backlog)) +} + +// Accept implements socket.Socket.Accept. 
+func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { + peerAddrBuf := make([]byte, sizeofSockaddr) + peerAddrlen := uint32(len(peerAddrBuf)) + peerAddrPtr := &peerAddrBuf[0] + peerAddrlenPtr := &peerAddrlen + + rc := cgo.Accept(s.fd, peerAddrPtr, peerAddrlenPtr) + if blocking { + for rc < 0 && (-rc) == errno.EAGAIN { + if err := s.waitEvent(t, waiter.EventIn); err != nil { + return 0, nil, 0, syserr.FromError(err) + } + + rc = cgo.Accept(s.fd, peerAddrPtr, peerAddrlenPtr) + } + } + + if rc < 0 { + return 0, nil, 0, int2err(rc) + } + + f, err := newSocket(t, s.family, s.skType, s.protocol, stack.notifier, int(rc), uint32(flags&syscall.SOCK_NONBLOCK)) + if err != nil { + cgo.Close(uint32(rc)) + return 0, nil, 0, err + } + defer f.DecRef(t) + + kfd, kerr := t.NewFDFrom(0, f, kernel.FDFlags{ + CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0, + }) + if kerr != nil { + cgo.Close(uint32(rc)) + return 0, nil, 0, syserr.FromError(kerr) + } + + t.Kernel().RecordSocket(f) + + if !peerRequested { + return kfd, nil, 0, nil + } + + peerAddr := socket.UnmarshalSockAddr(s.family, peerAddrBuf[:peerAddrlen]) + return kfd, peerAddr, peerAddrlen, nil +} + +// Connect implements socket.Socket.Connect. +func (s *socketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { + var ret int64 + + if !blocking { + ret = cgo.Connect(s.fd, sockaddr) + + if ret == 0 { + return nil + } + return int2err(ret) + } + + ret = cgo.Connect(s.fd, sockaddr) + + if ret == 0 { + return nil + } else if ret < 0 && (-ret) != errno.EINPROGRESS { + // Return error if errno is EALREADY/EISCONN/ECONNREFUSED/EINVAL + return int2err(ret) + } + + if err := s.waitEvent(t, waiter.EventOut); err != nil { + return syserr.FromError(err) + } + + // Call connect() again after blocking to find connect's result. 
+ ret = cgo.Connect(s.fd, sockaddr) + if ret == 0 { + return nil + } else if ret < 0 && (-ret) == errno.EISCONN { + return nil + } + return int2err(ret) +} + +// Shutdown implements socket.Socket.Shutdown. +func (s *socketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error { + ret := cgo.Shutdown(s.fd, how) + return int2err(ret) +} + +// GetSockOpt implements socket.Socket.GetSockOpt. +func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { + optVal := make([]byte, outLen) + if outLen == 0 { + optPtr := primitive.ByteSlice(optVal) + return &optPtr, nil + } + rc, outLen := cgo.Getsockopt(s.fd, level, name, optVal, outLen) + if rc < 0 { + return nil, int2err(rc) + } + optPtr := primitive.ByteSlice(optVal[:outLen]) + return &optPtr, nil +} + +// SetSockOpt implements socket.Socket.SetSockOpt. +func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error { + rc := cgo.Setsockopt(s.fd, level, name, optVal) + return int2err(rc) +} + +// State implements socket.Socket.State. +func (s *socketOperations) State() uint32 { + optVal := make([]byte, 1) + rc, _ := cgo.Getsockopt(s.fd, syscall.SOL_TCP, syscall.TCP_INFO, optVal, 1) + if rc < 0 { + return 0 + } + return uint32(optVal[0]) +} + +// Type implements socket.Socket.Type. +func (s *socketOperations) Type() (family int, skType linux.SockType, protocol int) { + return s.family, s.skType, s.protocol +} + +// OnClose implements vfs.FileDescriptionImpl.OnClose. +func (s *socketOperations) OnClose(ctx context.Context) error { + return nil +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (s *socketOperations) EventRegister(e *waiter.Entry) error { + s.eventInfo.Wq.EventRegister(e) + stack.notifier.UpdateFD(s.fd) + return nil +} + +// EventUnregister implements waiter.Waitable.EventUnregister. 
+func (s *socketOperations) EventUnregister(e *waiter.Entry) { + s.eventInfo.Wq.EventUnregister(e) + stack.notifier.UpdateFD(s.fd) +} + +// Readiness implements socket.Socket.Readiness. +func (s *socketOperations) Readiness(mask waiter.EventMask) waiter.EventMask { + var events waiter.EventMask + + evInfo := &s.eventInfo + iomask := mask & (waiter.EventIn | waiter.EventOut) + + // Fast path condition: + // 1. The event needed has been reported by plugin stack io-thread; or + // 2. POLLIN or POLLOUT event has been reported by io-thread. + // Directly report current event without invoking cgo Readiness again. + if evInfo.Ready&iomask == iomask { + events = evInfo.Ready & mask + // Clear plugin stack eventInfo record after consuming IN/OUT event. + evInfo.Ready &= ^iomask + } else { + events = waiter.EventMask(cgo.Readiness(s.fd, uint64(mask))) + } + + return events +} + +// Epollable implements socket.Socket.Epollable. +func (s *socketOperations) Epollable() bool { + return true +} + +// Refers to implementation in epsocket +func interfaceIoctl(ctx context.Context, io usermem.IO, cmd uint32, ifr *linux.IFReq) *syserr.Error { + stack := inet.StackFromContext(ctx) + if stack == nil { + return syserr.ErrNoDevice + } + + // SIOCGIFNAME uses ifr.ifr_ifindex rather than ifr.ifr_name to + // identify a device. + if cmd == syscall.SIOCGIFNAME { + // Gets the name of the interface given the interface index + // stored in ifr_ifindex. + index := int32(hostarch.ByteOrder.Uint32(ifr.Data[:4])) + if iface, ok := stack.Interfaces()[index]; ok { + ifr.SetName(iface.Name) + return nil + } + return syserr.ErrNoDevice + } + + // Find the relevant device. + var iface inet.Interface + index := int32(-1) + for i, iface := range stack.Interfaces() { + if iface.Name == ifr.Name() { + index = i + break + } + } + if index == -1 { + return syserr.ErrNoDevice + } + + switch cmd { + case syscall.SIOCGIFINDEX: + // Copy out the index to the data. 
+ hostarch.ByteOrder.PutUint32(ifr.Data[:], uint32(index)) + + case syscall.SIOCGIFHWADDR: + // Use IEEE802.3 instead of IEEE802.2 ARP type + // so that ifconfig command can recognize it + devType := 1 + if index == ifLoIndex { + devType = 772 // Loopback + } + hostarch.ByteOrder.PutUint16(ifr.Data[:2], uint16(devType)) + n := copy(ifr.Data[2:], iface.Addr) + for i := 2 + n; i < len(ifr.Data); i++ { + ifr.Data[i] = 0 // Clear padding. + } + + case syscall.SIOCGIFFLAGS: + f := iface.Flags + // Drop the flags that don't fit in the size that we need to + // return. This matches Linux behavior. + hostarch.ByteOrder.PutUint16(ifr.Data[:2], uint16(f)) + + case syscall.SIOCGIFADDR: + // Copy the IPv4 address out. + for _, addr := range stack.InterfaceAddrs()[index] { + // This ioctl is only compatible with AF_INET addresses. + if addr.Family != linux.AF_INET { + continue + } + copyAddrOut(ifr, &addr) + break + } + + case syscall.SIOCGIFMETRIC: + // Gets the metric of the device. As per netdevice(7), this + // always just sets ifr_metric to 0. + hostarch.ByteOrder.PutUint32(ifr.Data[:4], 0) + + case syscall.SIOCGIFMTU: + // Gets the MTU of the device. + hostarch.ByteOrder.PutUint32(ifr.Data[:4], iface.MTU) + + case syscall.SIOCGIFMAP: + // Gets the hardware parameters of the device. + hostarch.ByteOrder.PutUint64(ifr.Data[:8], 0) + hostarch.ByteOrder.PutUint32(ifr.Data[8:12], 0) + ifr.Data[12] = 0 + + case syscall.SIOCGIFTXQLEN: + hostarch.ByteOrder.PutUint32(ifr.Data[:4], 1024) + + case syscall.SIOCGIFDSTADDR: + // Gets the destination address of a point-to-point device. + // TODO: Implement SIOCGIFDSTADDR handler. + + case syscall.SIOCGIFBRDADDR: + // Gets the broadcast address of a device. + // TODO: Implement SIOCGIFBRDADDR handler. + + case syscall.SIOCGIFNETMASK: + // Gets the network mask of a device. + for _, addr := range stack.InterfaceAddrs()[index] { + // This ioctl is only compatible with AF_INET addresses. 
+ if addr.Family != linux.AF_INET { + continue + } + // Populate ifr.ifr_netmask (type sockaddr). + hostarch.ByteOrder.PutUint16(ifr.Data[0:2], uint16(addr.Family)) + hostarch.ByteOrder.PutUint16(ifr.Data[2:4], 0) + // Netmask is expected to be returned as a big endian value. + mask := uint32(0xffffffff << (32 - addr.PrefixLen)) + binary.BigEndian.PutUint32(ifr.Data[4:8], mask) + break + } + + default: + // Not a valid call. + return syserr.ErrInvalidArgument + } + + return nil +} + +func ifconfIoctlFromStack(ctx context.Context, io usermem.IO, ifc *linux.IFConf) error { + // If Ptr is NULL, return the necessary buffer size via Len. + // Otherwise, write up to Len bytes starting at Ptr containing ifreq + // structs. + t := ctx.(*kernel.Task) + s := t.NetworkContext().(*Stack) + if s == nil { + return syserr.ErrNoDevice.ToError() + } + + if ifc.Ptr == 0 { + ifc.Len = int32(len(s.Interfaces())) * int32(linux.SizeOfIFReq) + return nil + } + + max := ifc.Len + ifc.Len = 0 + for idx, iface := range s.Interfaces() { + ifaceAddrs := s.InterfaceAddrs()[idx] + for _, ifaceAddr := range ifaceAddrs { + if ifaceAddr.Family != syscall.AF_INET { + continue + } + + // Don't write past the end of the buffer. + if ifc.Len+int32(linux.SizeOfIFReq) > max { + break + } + + // Populate ifr.ifr_addr. + ifr := linux.IFReq{} + ifr.SetName(iface.Name) + copyAddrOut(&ifr, &ifaceAddr) + + // Copy the ifr to userspace. + dst := uintptr(ifc.Ptr) + uintptr(ifc.Len) + ifc.Len += int32(linux.SizeOfIFReq) + if _, err := ifr.CopyOut(t, hostarch.Addr(dst)); err != nil { + return err + } + } + } + return nil +} + +// Ioctl implements socket.Socket.Ioctl. 
+func (s *socketOperations) Ioctl(ctx context.Context, io usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { + cmd := uint32(args[1].Int()) + arg := args[2].Pointer() + + var buf []byte + switch cmd { + case syscall.SIOCGIFADDR, + syscall.SIOCGIFBRDADDR, + syscall.SIOCGIFDSTADDR, + syscall.SIOCGIFFLAGS, + syscall.SIOCGIFHWADDR, + syscall.SIOCGIFINDEX, + syscall.SIOCGIFMAP, + syscall.SIOCGIFMETRIC, + syscall.SIOCGIFMTU, + syscall.SIOCGIFNAME, + syscall.SIOCGIFNETMASK, + syscall.SIOCGIFTXQLEN: + var ifr linux.IFReq + ifrBuf := ctx.(*kernel.Task).CopyScratchBuffer(linux.SizeOfIFReq) + if _, err := io.CopyIn(ctx, args[2].Pointer(), ifrBuf, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + // Decode ifr from ifrBuf + // Note that these code may need to be modified if linux.IFReq struct changes + copy(ifr.IFName[0:linux.IFNAMSIZ], ifrBuf[0:linux.IFNAMSIZ]) + copy(ifr.Data[0:linux.SizeOfIFReq-linux.IFNAMSIZ], ifrBuf[linux.IFNAMSIZ:linux.SizeOfIFReq]) + if err := interfaceIoctl(ctx, io, cmd, &ifr); err != nil { + return 0, err.ToError() + } + copy(ifrBuf[0:linux.IFNAMSIZ], ifr.IFName[0:linux.IFNAMSIZ]) + copy(ifrBuf[linux.IFNAMSIZ:linux.SizeOfIFReq], ifr.Data[0:linux.SizeOfIFReq-linux.IFNAMSIZ]) + _, err := io.CopyOut(ctx, arg, ifrBuf, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case syscall.SIOCGIFCONF: + // SIOCGIFCONF has slightly different behavior than the others, in that it + // will need to populate the array of ifreqs. 
+ var ifc linux.IFConf + ifcBuf := ctx.(*kernel.Task).CopyScratchBuffer(linux.SizeOfIFConf) + if _, err := io.CopyIn(ctx, arg, ifcBuf, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + // Decode ifc from ifcBuf + // Note that these code may need to be modified if linux.IFconf struct changes + ifc.Len = int32(hostarch.ByteOrder.Uint32(ifcBuf[0:4])) + ifc.Ptr = hostarch.ByteOrder.Uint64(ifcBuf[8:]) + + if err := ifconfIoctlFromStack(ctx, io, &ifc); err != nil { + return 0, err + } + hostarch.ByteOrder.PutUint32(ifcBuf[0:4], uint32(ifc.Len)) + hostarch.ByteOrder.PutUint64(ifcBuf[8:], ifc.Ptr) + _, err := io.CopyOut(ctx, arg, ifcBuf, usermem.IOOpts{ + AddressSpaceActive: true, + }) + + return 0, err + case linux.SIOCGIFMEM, linux.SIOCGIFPFLAGS, linux.SIOCGMIIPHY, linux.SIOCGMIIREG: + unimpl.EmitUnimplementedEvent(ctx, sysno) + return 0, linuxerr.ENOTTY + case syscall.TIOCINQ, syscall.TIOCOUTQ: + buf = make([]byte, 4) + case syscall.SIOCGSTAMP: + buf = make([]byte, 16) + default: + return 0, linuxerr.ENOTTY + } + + rc := cgo.Ioctl(s.fd, cmd, buf) + if rc < 0 { + _, err := translateReturn(int64(rc)) + return 0, err + } + + _, err := io.CopyOut(ctx, arg, buf, usermem.IOOpts{ + AddressSpaceActive: true, + }) + + return 0, err +} + +// Release implements socket.Socket.Release. +func (s *socketOperations) Release(ctx context.Context) { + t := kernel.TaskFromContext(ctx) + t.Kernel().DeleteSocket(&s.vfsfd) + stack.notifier.RemoveFD(s.fd) + cgo.Close(s.fd) +} + +// GetSockName implements socket.Socket.GetSockName. +func (s *socketOperations) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { + addrlen := uint32(sizeofSockaddr) + addr := make([]byte, sizeofSockaddr) + rc := cgo.Getsockname(s.fd, addr, &addrlen) + if rc < 0 { + return nil, 0, int2err(rc) + } + return socket.UnmarshalSockAddr(s.family, addr), addrlen, nil +} + +// GetPeerName implements socket.Socket.GetPeerName. 
+func (s *socketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { + addrlen := uint32(sizeofSockaddr) + addr := make([]byte, sizeofSockaddr) + rc := cgo.GetPeername(s.fd, addr, &addrlen) + if rc < 0 { + return nil, 0, int2err(rc) + } + return socket.UnmarshalSockAddr(s.family, addr), addrlen, nil +} + +// recv is a helper function for doing non-blocking read once. +// It returns: +// 1. number of bytes received; +// 2. sender address for non-stream socket; +// 3. control message; +// 4. message flag; +// 5. internal error if any +func (s *socketOperations) recv(t *kernel.Task, dst usermem.IOSequence, sysflags int, addr []byte, control []byte) (int64, []byte, []byte, int, error) { + bytes := dst.NumBytes() + // Directly return if dst is empty. + if bytes == 0 { + return 0, addr, control, 0, nil + } + + // Slow path for non-stream socket(e.g., UDP, raw socket). + if s.skType != linux.SOCK_STREAM { + if len(addr) == 0 && s.skType == linux.SOCK_DGRAM { + addr = make([]byte, sizeofSockaddr) + } + + tmpBuf := make([]byte, bytes) + tmpBS := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(tmpBuf)) + iovs := iovecsFromBlockSeq(tmpBS, nil) + rc, la, lc, flag := cgo.Recvmsg(s.fd, iovs, addr, control, sysflags) + if rc < 0 { + _, err := translateReturn(rc) + return 0, addr /* return the original slice */, control /* return the original slice */, 0, err + } + + n, err := dst.CopyOut(t, tmpBuf[:rc]) + return int64(n), addr[:la], control[:lc], flag, err + } + + // Fast path for stream socket(e.g., TCP socket). + rw := getReadWriter(s.fd) + rw.to = control + rw.flags = uint32(sysflags) + + n, err := dst.CopyOutFrom(t, rw) + + control = rw.to + msg_flags := int(rw.flags) + putReadWriter(rw) + + return n, nil /* ignore for TCP */, control, msg_flags, err +} + +// RecvMsg implements socket.Socket.RecvMsg. 
// RecvMsg implements socket.Socket.RecvMsg.
//
// The receive itself is always issued non-blocking; blocking semantics
// are implemented here by waiting for EventIn between attempts. The
// results are, in order: bytes received, message flags, sender address
// and its length (only when requested and available), control messages,
// and the error.
func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
	var addr, control []byte
	if senderRequested {
		if s.skType != linux.SOCK_STREAM {
			addr = make([]byte, sizeofSockaddr)
		} else {
			// According to UNIX98, msg_name/msg_namelen are ignored on connected socket.
			senderRequested = false
		}
	}
	if controlDataLen > 0 {
		control = make([]byte, controlDataLen)
	}

	// Store nonblock info from original flag.
	nonblock := flags&syscall.MSG_DONTWAIT != 0
	// MSG_WAITALL is only honoured for blocking receives.
	waitall := nonblock == false && flags&syscall.MSG_WAITALL != 0

	var mflag int
	var n int64
	var err error
	// Always set as non-blocking for recv.
	sysflags := flags | syscall.MSG_DONTWAIT
	n, addr, control, mflag, err = s.recv(t, dst, sysflags, addr, control)

	var senderAddr linux.SockAddr
	var addrlen uint32
	// For non-blocking RecvMsg, we could return after recv once.
	if nonblock {
		if err != nil {
			// Clear controlMessages when error occurs.
			control = nil
		} else if senderRequested && len(addr) > 0 {
			senderAddr = socket.UnmarshalSockAddr(s.family, addr)
			addrlen = uint32(len(addr))
		}
		controlMessages := buildControlMessage(control)
		// The socket is shutdown, report ErrWouldBlock for non-blocking mode.
		if err == error(syscall.ESHUTDOWN) {
			err = linuxerr.ErrWouldBlock
		}
		return int(n), mflag, senderAddr, addrlen, *controlMessages, syserr.FromError(err)
	}

	// For blocking RecvMsg, we need to finish reading all data before return.
	// rn is the size of the latest receive; n accumulates the total.
	rn := n
	for err == linuxerr.ErrWouldBlock || (waitall && err == nil && rn < dst.NumBytes()) {
		// Skip what the previous attempt already filled in.
		dst = dst.DropFirst(int(rn))
		// Wait on POLLIN event.
		if haveDeadline {
			err = s.waitEventT(t, waiter.EventIn, deadline)
		} else {
			err = s.waitEvent(t, waiter.EventIn)
		}
		if err != nil {
			if n == 0 {
				// There is no POLLIN event reported before timeout.
				if err == linuxerr.ETIMEDOUT {
					err = linuxerr.ErrWouldBlock
				}
				return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err)
			}
			// Some data was already received: return it and drop the
			// event-waiting error.
			err = nil
			mflag = 0
			break
		}
		rn, addr, control, mflag, err = s.recv(t, dst, sysflags, addr, control)
		n += rn
	}

	// Plugin stack reported that the socket has been shut down: a
	// blocking receive returns 0 bytes and no error in that case.
	if err == error(syscall.ESHUTDOWN) {
		n = 0
		err = nil
	} else if n > 0 {
		// If any data has been received, clear internal error.
		err = nil
	}

	// For receiving error other than ESHUTDOWN and not receiving any data,
	// clear controlMessages and report through internal error.
	if err != nil {
		control = nil
	} else if senderRequested && len(addr) > 0 {
		senderAddr = socket.UnmarshalSockAddr(s.family, addr)
		addrlen = uint32(len(addr))
	}
	controlMessages := buildControlMessage(control)
	return int(n), mflag, senderAddr, addrlen, *controlMessages, syserr.FromError(err)
}

// Read implements socket.Socket.Read.
//
// Read options with flags are rejected with EOPNOTSUPP. A single read is
// issued; this method itself never waits.
func (s *socketOperations) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
	if opts.Flags != 0 {
		return 0, linuxerr.EOPNOTSUPP
	}
	nonblock := (s.vfsfd.StatusFlags() & linux.O_NONBLOCK) != 0

	bytes := dst.NumBytes()
	// Directly return if dst is empty.
	if bytes == 0 {
		return 0, nil
	}

	rw := getReadWriter(s.fd)
	n, err := dst.CopyOutFrom(ctx, rw)
	putReadWriter(rw)

	// A shut-down socket reads as "would block" on a non-blocking fd and
	// as 0 bytes without error otherwise.
	if err == error(syscall.ESHUTDOWN) {
		if nonblock {
			err = linuxerr.ErrWouldBlock
		} else {
			n = 0
			err = nil
		}
	}

	return int64(n), err
}

// send is a helper function for doing one send attempt. It returns the
// number of bytes sent and the error, if any.
//
// NOTE(review): sysflags is only passed to plugin stack on the empty-src
// path; for non-empty src the read-writer issues its calls with flags 0.
// Confirm this is intended.
func (s *socketOperations) send(t *kernel.Task, src usermem.IOSequence, to []byte, sysflags int) (int64, error) {
	bytes := src.NumBytes()
	// If src is empty, let plugin stack handle sending 0-byte data based on protocol.
	if bytes == 0 {
		rc := cgo.Sendto(s.fd, 0, 0, sysflags, to)
		ret, err := translateReturn(rc)
		return int64(ret), err
	}

	rw := getReadWriter(s.fd)
	rw.to = to
	n, err := src.CopyInTo(t, rw)
	putReadWriter(rw)
	return n, err
}

// SendMsg implements socket.Socket.SendMsg.
//
// The send itself is always issued with MSG_DONTWAIT; a blocking SendMsg
// waits for EventOut and retries until everything was sent, an error
// other than ErrWouldBlock occurs, or the wait fails. A wait that fails
// with ETIMEDOUT is reported as ErrWouldBlock.
func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) {
	total := src.NumBytes()
	// Remember whether the caller asked for a non-blocking send.
	nonblock := flags & syscall.MSG_DONTWAIT
	sysflags := flags | syscall.MSG_DONTWAIT

	n, err := s.send(t, src, to, sysflags)

	if nonblock != 0 {
		return int(n), syserr.FromError(err)
	}

	// sent accumulates the bytes accepted by plugin stack so far.
	var sent int64
	for err == nil || err == linuxerr.ErrWouldBlock {
		if n > 0 {
			src = src.DropFirst64(n)
			sent += n
		}

		if sent == total {
			return int(total), nil
		}

		if haveDeadline {
			err = s.waitEventT(t, waiter.EventOut, deadline)
		} else {
			err = s.waitEvent(t, waiter.EventOut)
		}

		if err != nil {
			if err == linuxerr.ETIMEDOUT {
				err = linuxerr.ErrWouldBlock
			}
			return int(sent), syserr.FromError(err)
		}

		n, err = s.send(t, src, to, sysflags)
	}

	return int(sent), syserr.FromError(err)
}
+func (s *socketOperations) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + bytes := src.NumBytes() + // If src is empty, let plugin stack handle sending 0-byte data based on protocol. + if bytes == 0 { + rc := cgo.Write(s.fd, 0, 0) + ret, err := translateReturn(rc) + return int64(ret), err + } + + rw := getReadWriter(s.fd) + n, err := src.CopyInTo(ctx, rw) + putReadWriter(rw) + if n < bytes && err == nil { + return n, linuxerr.ErrWouldBlock + } + return int64(n), err +} + +// waitEvent implements blocking wait on given events +// until one of the events has been reported. +func (s *socketOperations) waitEvent(ctx context.Context, event waiter.EventMask) error { + var err error + t := ctx.(*kernel.Task) + e, ch := waiter.NewChannelEntry(event | waiter.EventErr | waiter.EventHUp) + s.EventRegister(&e) + + // It's possible that an event happens between the last check and EventRegister. + // If this happens and we don't check readiness again, we would miss + // the event and get blocked forever. + if s.Readiness(event|waiter.EventErr|waiter.EventHUp) == 0 { + err = t.Block(ch) + } + + s.EventUnregister(&e) + return err +} + +// waitEventT implements blocking wait on given events +// until one of the events has been reported or the given deadline expires. +func (s *socketOperations) waitEventT(ctx context.Context, event waiter.EventMask, deadline ktime.Time) error { + var err error + t := ctx.(*kernel.Task) + e, ch := waiter.NewChannelEntry(event | waiter.EventErr | waiter.EventHUp) + s.EventRegister(&e) + + if s.Readiness(event|waiter.EventErr|waiter.EventHUp) == 0 { + err = t.BlockWithDeadline(ch, true, deadline) + } + + s.EventUnregister(&e) + return err +} diff --git a/pkg/sentry/socket/plugin/stack/stack.go b/pkg/sentry/socket/plugin/stack/stack.go new file mode 100644 index 0000000000..acb167e837 --- /dev/null +++ b/pkg/sentry/socket/plugin/stack/stack.go @@ -0,0 +1,86 @@ +// Copyright 2023 The gVisor Authors.
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package stack provides an implementation of plugin.PluginStack +// interface and an implementation of socket.Socket interface. +// +// It glues sentry interfaces with plugin netstack interfaces defined in cgo. +package stack + +import ( + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/socket/plugin" + "gvisor.dev/gvisor/pkg/sentry/socket/plugin/cgo" +) + +// Stack is a struct that interacts with third-party network stack. +// It implements inet.Stack and plugin.PluginStack. +type Stack struct { + inet.Stack + + notifier *Notifier +} + +var stack *Stack + +// Init implements plugin.PluginStack.Init. +func (s *Stack) Init(args *plugin.InitStackArgs) error { + if err := cgo.InitStack(args.InitStr, args.FDs); err != nil { + return err + } + s.notifier = NewNotifier() + stack = s + return nil +} + +// PreInit implements plugin.PluginStack.PreInit. +func (s *Stack) PreInit(args *plugin.PreInitStackArgs) (string, []int, error) { + return cgo.PreInitStack(args.Pid) +} + +// Interfaces implements inet.Stack.Interfaces. +func (s *Stack) Interfaces() map[int32]inet.Interface { + // TODO: support Interfaces + return make(map[int32]inet.Interface) +} + +// InterfaceAddrs implements inet.Stack.InterfaceAddrs. 
+func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr { + // TODO: support InterfaceAddrs + return make(map[int32][]inet.InterfaceAddr) +} + +// AddInterfaceAddr implements inet.Stack.AddInterfaceAddr. +func (s *Stack) AddInterfaceAddr(idx int32, addr inet.InterfaceAddr) error { + return linuxerr.EACCES +} + +// RemoveInterfaceAddr implements inet.Stack.RemoveInterfaceAddr. +func (s *Stack) RemoveInterfaceAddr(int32, inet.InterfaceAddr) error { + return linuxerr.EACCES +} + +// SupportsIPv6 implements Stack.SupportsIPv6. +func (s *Stack) SupportsIPv6() bool { + return true +} + +// Destroy implements inet.Stack.Destroy. +func (*Stack) Destroy() { +} + +func init() { + plugin.RegisterPluginStack(&Stack{}) +} diff --git a/pkg/sentry/socket/plugin/stack/util.go b/pkg/sentry/socket/plugin/stack/util.go new file mode 100644 index 0000000000..d307446aff --- /dev/null +++ b/pkg/sentry/socket/plugin/stack/util.go @@ -0,0 +1,91 @@ +// Copyright 2023 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package stack + +import ( + "net" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/abi/linux/errno" + "gvisor.dev/gvisor/pkg/hostarch" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/sentry/socket/plugin/cgo" + "gvisor.dev/gvisor/pkg/syserr" +) + +func int2err(from int64) *syserr.Error { + if from >= 0 { + return nil + } + + if (-from) == errno.EAGAIN { + return syserr.ErrWouldBlock + } + + return syserr.FromHost(syscall.Errno(-from)) +} + +func translateReturn(ret int64) (uint64, error) { + if ret < 0 { + return 0, int2err(ret).ToError() + } else if ret == 0 { + return 0, nil + } else { + return uint64(ret), nil + } +} + +func copyAddrOut(ifr *linux.IFReq, ifaceAddr *inet.InterfaceAddr) { + hostarch.ByteOrder.PutUint16(ifr.Data[0:2], uint16(ifaceAddr.Family)) + hostarch.ByteOrder.PutUint16(ifr.Data[2:4], 0) // port + if ifaceAddr.Family == linux.AF_INET { + copy(ifr.Data[4:8], net.IP(ifaceAddr.Addr).To4()[:4]) + } else { + copy(ifr.Data[8:24], ifaceAddr.Addr[:16]) + } +} + +func iovecsFromBlockSeq(bs safemem.BlockSeq, rw *pluginStackRW) []syscall.Iovec { + var iovs []syscall.Iovec + if rw != nil { + // Reuse the old buffer and set length to zero. 
+ iovs = rw.iovs[:0] + } + for ; !bs.IsEmpty(); bs = bs.Tail() { + b := bs.Head() + iovs = append(iovs, syscall.Iovec{ + Base: &b.ToSlice()[0], + Len: uint64(b.Len()), + }) + } + return iovs +} + +func buildControlMessage(controlData []byte) *socket.ControlMessages { + controlMessages := socket.ControlMessages{} + if len(controlData) >= 28 { + timebytes := controlData[12:] + timeval := (*linux.Timeval)(cgo.GetPtr(timebytes)) + m := socket.IPControlMessages{ + HasTimestamp: true, + Timestamp: timeval.ToTime(), + } + controlMessages.IP = m + } + return &controlMessages +} diff --git a/pkg/usermem/BUILD b/pkg/usermem/BUILD index 698a013e44..d285553e96 100644 --- a/pkg/usermem/BUILD +++ b/pkg/usermem/BUILD @@ -16,6 +16,7 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/atomicbitops", + "//pkg/binary", "//pkg/context", "//pkg/errors/linuxerr", "//pkg/gohacks", diff --git a/pkg/usermem/usermem.go b/pkg/usermem/usermem.go index b0782155ce..7d5fc7ec68 100644 --- a/pkg/usermem/usermem.go +++ b/pkg/usermem/usermem.go @@ -21,6 +21,7 @@ import ( "io" "strconv" + "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/gohacks" @@ -569,3 +570,48 @@ func (rw *IOSequenceReadWriter) Write(src []byte) (int, error) { } return n, err } + +// CopyObjectOut copies a fixed-size value or slice of fixed-size values from +// src to the memory mapped at addr in uio. It returns the number of bytes +// copied. +// +// CopyObjectOut must use reflection to encode src; performance-sensitive +// clients should do encoding manually and use uio.CopyOut directly. +// +// Preconditions: Same as IO.CopyOut. +func CopyObjectOut(ctx context.Context, uio IO, addr hostarch.Addr, src interface{}, opts IOOpts) (int, error) { + w := &IOReadWriter{ + Ctx: ctx, + IO: uio, + Addr: addr, + Opts: opts, + } + // Allocate a byte slice the size of the object being marshaled. 
This + // adds an extra reflection call, but avoids needing to grow the slice + // during encoding, which can result in many heap-allocated slices. + b := make([]byte, 0, binary.Size(src)) + return w.Write(binary.Marshal(b, hostarch.ByteOrder, src)) +} + +// CopyObjectIn copies a fixed-size value or slice of fixed-size values from +// the memory mapped at addr in uio to dst. It returns the number of bytes +// copied. +// +// CopyObjectIn must use reflection to decode dst; performance-sensitive +// clients should use uio.CopyIn directly and do decoding manually. +// +// Preconditions: Same as IO.CopyIn. +func CopyObjectIn(ctx context.Context, uio IO, addr hostarch.Addr, dst interface{}, opts IOOpts) (int, error) { + r := &IOReadWriter{ + Ctx: ctx, + IO: uio, + Addr: addr, + Opts: opts, + } + buf := make([]byte, binary.Size(dst)) + if _, err := io.ReadFull(r, buf); err != nil { + return 0, err + } + binary.Unmarshal(buf, hostarch.ByteOrder, dst) + return int(r.Addr - addr), nil +} diff --git a/runsc/BUILD b/runsc/BUILD index 183a259e25..4f395ce569 100644 --- a/runsc/BUILD +++ b/runsc/BUILD @@ -20,6 +20,23 @@ go_binary( ], ) +go_binary( + name = "runsc-plugin-stack", + srcs = ["main_plugin_stack.go"], + pure = False, + static = True, + tags = ["staging"], + visibility = [ + "//visibility:public", + ], + x_defs = {"gvisor.dev/gvisor/runsc/version.version": "{STABLE_VERSION}"}, + deps = [ + "//runsc/cli", + "//runsc/version", + "//pkg/sentry/socket/plugin/stack", + ], +) + # The runsc-race target is a race-compatible BUILD target. 
This must be built # via: bazel build --features=race :runsc-race # diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index b89a4dc3df..df240ee6cc 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -89,6 +89,7 @@ go_library( "//pkg/sentry/seccheck/points:points_go_proto", "//pkg/sentry/seccheck/sinks/null", "//pkg/sentry/seccheck/sinks/remote", + "//pkg/sentry/socket/plugin", "//pkg/sentry/socket/hostinet", "//pkg/sentry/socket/netfilter", "//pkg/sentry/socket/netlink", diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index e88e09cacc..acbe64a7ff 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -35,6 +35,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/seccheck" "gvisor.dev/gvisor/pkg/sentry/socket/netstack" + "gvisor.dev/gvisor/pkg/sentry/socket/plugin" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/state/statefile" "gvisor.dev/gvisor/pkg/urpc" @@ -118,6 +119,9 @@ const ( // NetworkCreateLinksAndRoutes creates links and routes in a network stack. NetworkCreateLinksAndRoutes = "Network.CreateLinksAndRoutes" + // NetworkInitPluginStack initializes third-party network stack. + NetworkInitPluginStack = "Network.InitPluginStack" + // DebugStacks collects sandbox stacks for debugging. 
DebugStacks = "debug.Stacks" ) @@ -202,6 +206,11 @@ func (c *controller) registerHandlers() { Kernel: l.k, }) } + + if pluginStack, ok := l.k.RootNetworkNamespace().Stack().(plugin.PluginStack); ok { + c.srv.Register(&Network{PluginStack: pluginStack}) + } + if l.root.conf.ProfileEnable { c.srv.Register(control.NewProfile(l.k)) } diff --git a/runsc/boot/filter/config/BUILD b/runsc/boot/filter/config/BUILD index e282c7ea7a..94fdfe82e1 100644 --- a/runsc/boot/filter/config/BUILD +++ b/runsc/boot/filter/config/BUILD @@ -11,6 +11,7 @@ go_library( "config.go", "config_amd64.go", "config_arm64.go", + "config_cgo.go", "config_main.go", "config_precompiled.go", "config_profile.go", @@ -35,6 +36,7 @@ go_library( "//pkg/sentry/devices/tpuproxy", "//pkg/sentry/platform", "//pkg/sentry/socket/hostinet", + "//pkg/sentry/socket/plugin", "//pkg/tcpip/link/fdbased", "//runsc/boot/platforms", "@org_golang_x_sync//errgroup:go_default_library", diff --git a/runsc/boot/filter/config/config.go b/runsc/boot/filter/config/config.go index 10dc0f6c0e..be11873418 100644 --- a/runsc/boot/filter/config/config.go +++ b/runsc/boot/filter/config/config.go @@ -29,6 +29,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/devices/nvproxy" "gvisor.dev/gvisor/pkg/sentry/devices/tpuproxy" "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/socket/plugin" ) // Options are seccomp filter related options.
@@ -41,6 +42,8 @@ type Options struct { NVProxy bool TPUProxy bool ControllerFD uint32 + CgoEnabled bool + PluginNetwork bool } // isInstrumentationEnabled returns whether there are any @@ -66,6 +69,8 @@ func (opt Options) ConfigKey() string { sb.WriteString(fmt.Sprintf("Instrumentation=%t ", isInstrumentationEnabled())) sb.WriteString(fmt.Sprintf("NVProxy=%t ", opt.NVProxy)) sb.WriteString(fmt.Sprintf("TPUProxy=%t ", opt.TPUProxy)) + sb.WriteString(fmt.Sprintf("CgoEnabled=%t ", opt.CgoEnabled)) + sb.WriteString(fmt.Sprintf("PluginNetwork=%t ", opt.PluginNetwork)) return strings.TrimSpace(sb.String()) } @@ -95,6 +100,12 @@ func Warnings(opt Options) []string { if opt.TPUProxy { warnings = append(warnings, "TPU device proxy enabled: syscall filters less restrictive!") } + if opt.CgoEnabled { + warnings = append(warnings, "CGO enabled: syscall filters less restrictive!") + } + if opt.PluginNetwork { + warnings = append(warnings, "plugin network stack enabled: syscall filters less restrictive!") + } return warnings } @@ -143,6 +154,12 @@ func rules(opt Options, vars precompiledseccomp.Values) (seccomp.SyscallRules, s s.Merge(accel.Filters()) s.Merge(tpuproxy.Filters()) } + if opt.CgoEnabled { + s.Merge(cgoFilters()) + } + if opt.PluginNetwork { + s.Merge(plugin.PluginFilters()) + } s.Merge(opt.Platform.SyscallFilters(vars)) return s, seccomp.DenyNewExecMappings diff --git a/runsc/boot/filter/config/config_cgo.go b/runsc/boot/filter/config/config_cgo.go new file mode 100644 index 0000000000..dac379a5e8 --- /dev/null +++ b/runsc/boot/filter/config/config_cgo.go @@ -0,0 +1,57 @@ +// Copyright 2023 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +package config + +import ( + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/seccomp" +) + +func cgoFilters() seccomp.SyscallRules { + return seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ + unix.SYS_MMAP: seccomp.Or{ + seccomp.PerArg{ + seccomp.AnyValue{}, + seccomp.AnyValue{}, + seccomp.EqualTo(unix.PROT_NONE), + seccomp.EqualTo( + unix.MAP_PRIVATE | + unix.MAP_ANONYMOUS | + unix.MAP_NORESERVE), + }, + seccomp.PerArg{ + seccomp.AnyValue{}, + seccomp.AnyValue{}, + seccomp.AnyValue{}, + seccomp.EqualTo( + unix.MAP_PRIVATE | + unix.MAP_ANONYMOUS | + unix.MAP_STACK), + }, + }, + // The following three syscalls are needed for now since + // clone3() will be invoked by _cgo_sys_thread_start if cgo is + // imported during compiling the binary. + // However, clone3() is a wide API which should not be opened; but + // since the seccomp filter we use now does not have a way to return + // a custom error code (e.g. ENOSYS), currently we have to open up + // these three syscalls.
+ // TODO(eperot): remove this syscall seccomp rule + unix.SYS_SET_ROBUST_LIST: seccomp.MatchAll{}, + // TODO(eperot): remove this syscall seccomp rule + unix.SYS_CLONE3: seccomp.MatchAll{}, + // TODO(eperot): remove this syscall seccomp rule + unix.SYS_RSEQ: seccomp.MatchAll{}, + }) +} diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index fe0760b339..9f0a1b1505 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -54,6 +54,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/seccheck" pb "gvisor.dev/gvisor/pkg/sentry/seccheck/points/points_go_proto" "gvisor.dev/gvisor/pkg/sentry/socket/netfilter" + "gvisor.dev/gvisor/pkg/sentry/socket/plugin" "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -811,6 +812,8 @@ func (l *Loader) installSeccompFilters() error { NVProxy: specutils.NVProxyEnabled(l.root.spec, l.root.conf), TPUProxy: specutils.TPUProxyIsEnabled(l.root.spec, l.root.conf), ControllerFD: uint32(l.ctrl.srv.FD()), + CgoEnabled: config.CgoEnabled, + PluginNetwork: l.root.conf.Network == config.NetworkPlugin, } if err := filter.Install(opts); err != nil { return fmt.Errorf("installing seccomp filters: %w", err) @@ -1446,6 +1449,8 @@ func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, userns *aut allowPacketEndpointWrite: conf.AllowPacketEndpointWrite, } return inet.NewRootNamespace(s, creator, userns), nil + case config.NetworkPlugin: + return inet.NewRootNamespace(plugin.GetPluginStack(), nil, userns), nil default: panic(fmt.Sprintf("invalid network configuration: %v", conf.Network)) diff --git a/runsc/boot/network.go b/runsc/boot/network.go index e8933550b1..08cecc9f43 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -21,12 +21,14 @@ import ( "os" "runtime" "strings" + "syscall" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/hostos" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket/netfilter" +
"gvisor.dev/gvisor/pkg/sentry/socket/plugin" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/link/ethernet" "gvisor.dev/gvisor/pkg/tcpip/link/fdbased" @@ -71,6 +73,10 @@ var ( type Network struct { Stack *stack.Stack Kernel *kernel.Kernel + + // PluginStack is a third-party network stack to use in place of + // netstack when non-nil. + PluginStack plugin.PluginStack } // Route represents a route in the network stack. @@ -184,6 +190,13 @@ type CreateLinksAndRoutesArgs struct { DisconnectOk bool } +// InitPluginStackArgs are arguments to InitPluginStack. +type InitPluginStackArgs struct { + urpc.FilePayload + + InitStr string +} + // IPWithPrefix is an address with its subnet prefix length. type IPWithPrefix struct { // Address is a network address. @@ -214,6 +227,31 @@ func (r *Route) toTcpipRoute(id tcpip.NICID) (tcpip.Route, error) { }, nil } +// InitPluginStack initializes the plugin network stack: it duplicates every FD in +// args.FilePayload and passes the copies, with args.InitStr, to the stack's Init(). +func (n *Network) InitPluginStack(args *InitPluginStackArgs, _ *struct{}) error { + pluginStack := n.PluginStack + if pluginStack == nil { + return fmt.Errorf("plugin stack is not registered") + } + + fdNum := len(args.FilePayload.Files) + fds := make([]int, fdNum) + for i := 0; i < fdNum; i++ { + oldFD := args.FilePayload.Files[i].Fd() + newFD, err := syscall.Dup(int(oldFD)) + if err != nil { + return fmt.Errorf("failed to dup FD %d: %w", oldFD, err) + } + fds[i] = newFD + } + + return pluginStack.Init(&plugin.InitStackArgs{ + InitStr: args.InitStr, + FDs: fds, + }) +} + // CreateLinksAndRoutes creates links and routes in a network stack. It should // only be called once.
func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct{}) error { diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index f2bcb84df1..7ab3a9012b 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -95,6 +95,7 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", + "//pkg/sentry/socket/plugin", "//pkg/state/pretty", "//pkg/state/statefile", "//pkg/unet", diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index 729839a67a..020ccd35aa 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -245,6 +245,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcomm UDSCreateEnabled: conf.GetHostUDS().AllowCreate(), ProfileEnabled: len(profileOpts) > 0, DirectFS: conf.DirectFS, + CgoEnabled: config.CgoEnabled, } if err := filter.Install(opts); err != nil { util.Fatalf("installing seccomp filters: %v", err) diff --git a/runsc/config/BUILD b/runsc/config/BUILD index 6b74717190..be63559e3d 100644 --- a/runsc/config/BUILD +++ b/runsc/config/BUILD @@ -8,6 +8,8 @@ package( go_library( name = "config", srcs = [ + "cgo_disabled.go", + "cgo_enabled.go", "config.go", "config_bundles.go", "flags.go", diff --git a/runsc/config/cgo_disabled.go b/runsc/config/cgo_disabled.go new file mode 100644 index 0000000000..ea7f4e21f7 --- /dev/null +++ b/runsc/config/cgo_disabled.go @@ -0,0 +1,20 @@ +// Copyright 2023 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +//go:build !cgo +// +build !cgo + +package config + +const CgoEnabled = false diff --git a/runsc/config/cgo_enabled.go b/runsc/config/cgo_enabled.go new file mode 100644 index 0000000000..0075e7b199 --- /dev/null +++ b/runsc/config/cgo_enabled.go @@ -0,0 +1,20 @@ +// Copyright 2023 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build cgo +// +build cgo + +package config + +const CgoEnabled = true diff --git a/runsc/config/config.go b/runsc/config/config.go index 0b5bbe5ad4..d585a0037b 100644 --- a/runsc/config/config.go +++ b/runsc/config/config.go @@ -594,6 +594,9 @@ const ( // NetworkNone sets up just loopback using netstack. NetworkNone + + // NetworkPlugin uses third-party network stack. 
+ NetworkPlugin ) func networkTypePtr(v NetworkType) *NetworkType { @@ -609,6 +612,8 @@ func (n *NetworkType) Set(v string) error { *n = NetworkHost case "none": *n = NetworkNone + case "plugin": + *n = NetworkPlugin default: return fmt.Errorf("invalid network type %q", v) } @@ -629,6 +634,8 @@ func (n NetworkType) String() string { return "host" case NetworkNone: return "none" + case NetworkPlugin: + return "plugin" } panic(fmt.Sprintf("Invalid network type %d", n)) } diff --git a/runsc/fsgofer/filter/BUILD b/runsc/fsgofer/filter/BUILD index ca90e7d1a5..cfa07488db 100644 --- a/runsc/fsgofer/filter/BUILD +++ b/runsc/fsgofer/filter/BUILD @@ -11,6 +11,7 @@ go_library( "config.go", "config_amd64.go", "config_arm64.go", + "config_cgo.go", "config_profile.go", "extra_filters.go", "extra_filters_msan.go", @@ -27,6 +28,7 @@ go_library( "//pkg/flipcall", "//pkg/log", "//pkg/seccomp", + "//pkg/sentry/socket/plugin", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/runsc/fsgofer/filter/config_cgo.go b/runsc/fsgofer/filter/config_cgo.go new file mode 100644 index 0000000000..4c99c20510 --- /dev/null +++ b/runsc/fsgofer/filter/config_cgo.go @@ -0,0 +1,48 @@ +// Copyright 2023 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+package filter + +import ( + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/seccomp" +) + +var cgoFilters = seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ + unix.SYS_MMAP: seccomp.Or{ + seccomp.PerArg{ + seccomp.AnyValue{}, + seccomp.AnyValue{}, + seccomp.EqualTo(unix.PROT_NONE), + seccomp.EqualTo( + unix.MAP_PRIVATE | + unix.MAP_ANONYMOUS | + unix.MAP_NORESERVE), + }, + seccomp.PerArg{ + seccomp.AnyValue{}, + seccomp.AnyValue{}, + seccomp.AnyValue{}, + seccomp.EqualTo( + unix.MAP_PRIVATE | + unix.MAP_ANONYMOUS | + unix.MAP_STACK), + }, + }, + // TODO(eperot): remove this syscall seccomp rule + unix.SYS_SET_ROBUST_LIST: seccomp.MatchAll{}, + // TODO(eperot): remove this syscall seccomp rule + unix.SYS_CLONE3: seccomp.MatchAll{}, + // TODO(eperot): remove this syscall seccomp rule + unix.SYS_RSEQ: seccomp.MatchAll{}, +}) diff --git a/runsc/fsgofer/filter/filter.go b/runsc/fsgofer/filter/filter.go index bafdfeb4a9..c2103a7f11 100644 --- a/runsc/fsgofer/filter/filter.go +++ b/runsc/fsgofer/filter/filter.go @@ -28,6 +28,7 @@ type Options struct { UDSCreateEnabled bool ProfileEnabled bool DirectFS bool + CgoEnabled bool } // Install installs seccomp filters. @@ -50,6 +51,11 @@ func Install(opt Options) error { } } + if opt.CgoEnabled { + report("CGO enabled: syscall filters less restrictive!") + s.Merge(cgoFilters) + } + // Set of additional filters used by -race and -msan. Returns empty // when not enabled. s.Merge(instrumentationFilters()) diff --git a/runsc/main_plugin_stack.go b/runsc/main_plugin_stack.go new file mode 100644 index 0000000000..b59c1a9c83 --- /dev/null +++ b/runsc/main_plugin_stack.go @@ -0,0 +1,34 @@ +// Copyright 2023 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build go1.1 +// +build go1.1 + +// Binary runsc-plugin-stack implements the OCI runtime interface +// which supports using third-party network stack as plugin netstack. +package main + +import ( + _ "gvisor.dev/gvisor/pkg/sentry/socket/plugin/stack" + "gvisor.dev/gvisor/runsc/cli" + "gvisor.dev/gvisor/runsc/version" +) + +// version.Version is set dynamically, but needs to be +// linked in the binary, so reference it here. +var _ = version.Version() + +func main() { + cli.Main() +} diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index 83ba8ba606..a43c8418d5 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -36,6 +36,7 @@ go_library( "//pkg/sentry/pgalloc", "//pkg/sentry/platform", "//pkg/sentry/seccheck", + "//pkg/sentry/socket/plugin", "//pkg/state/statefile", "//pkg/sync", "//pkg/tcpip/header", diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index d069325114..996b2f674e 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -27,6 +27,7 @@ import ( "github.com/vishvananda/netlink" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/socket/plugin" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/urpc" @@ -69,6 +70,10 @@ func setupNetwork(conn *urpc.Client, pid int, conf *config.Config) error { } case config.NetworkHost: // Nothing to do here. 
+ case config.NetworkPlugin: + if err := initPluginStack(conn, pid, conf); err != nil { + return fmt.Errorf("failed to initialize external stack, error: %v", err) + } default: return fmt.Errorf("invalid network type: %v", conf.Network) } @@ -338,6 +343,30 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, conf *con return nil } +func initPluginStack(conn *urpc.Client, pid int, conf *config.Config) error { + pluginStack := plugin.GetPluginStack() + if pluginStack == nil { + return fmt.Errorf("plugin stack is not registered") + } + + initStr, fds, err := pluginStack.PreInit(&plugin.PreInitStackArgs{Pid: pid}) + if err != nil { + return fmt.Errorf("plugin stack PreInit failed: %v", err) + } + var args boot.InitPluginStackArgs + args.InitStr = initStr + for _, fd := range fds { + args.FilePayload.Files = append(args.FilePayload.Files, os.NewFile(uintptr(fd), "")) + } + + log.Debugf("Initializing plugin network stack, config: %+v", args) + if err := conn.Call(boot.NetworkInitPluginStack, &args, nil); err != nil { + return fmt.Errorf("error initializing plugin netstack: %v", err) + } + + return nil +} + // isAddressOnInterface checks if an address is on an interface func isAddressOnInterface(ifaceName string, addr *net.IPNet) (bool, error) { iface, err := net.InterfaceByName(ifaceName) diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 76e582319e..9617963f79 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -891,6 +891,10 @@ func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyn cmd.Env = append(cmd.Env, "GODEBUG=asyncpreemptoff=1") } + if conf.Network == config.NetworkPlugin { + cmd.Env = append(cmd.Env, "GODEBUG=cgocheck=0") + } + // nss is the set of namespaces to join or create before starting the sandbox // process. Mount, IPC and UTS namespaces from the host are not used as they // are virtualized inside the sandbox. 
Be paranoid and run inside an empty diff --git a/tools/plugin-stack/plugin-stack.BUILD b/tools/plugin-stack/plugin-stack.BUILD new file mode 100644 index 0000000000..493b1d9b49 --- /dev/null +++ b/tools/plugin-stack/plugin-stack.BUILD @@ -0,0 +1,22 @@ +config_setting( + name = "plugin_tldk_condition", + values = {"define": "plugin_tldk=true"}, +) + +genrule( + name = "pluginstack_genrule", + outs = ["libpluginstack.a"], + local = 1, + cmd = select({ + # Support IVB and later machines. + ":plugin_tldk_condition": "git clone git@github.com:alipay/tldk.git; cd tldk; git checkout 9efb0dacb67da1da62ca78785e8cffb0c5a82785; make -j 1 DPDK_MACHINE=ivb EXTRA_CFLAGS='-g -O3 -fPIC -fno-omit-frame-pointer -DLOOK_ASIDE_BACKEND -Wno-error' all; cd ..; cp -f tldk/libtldk.a $(RULEDIR)/libpluginstack.a", + "//conditions:default": "", + }), + visibility = ["//visibility:public"], +) + +cc_library( + name = "libpluginstack", + srcs = ["libpluginstack.a"], + visibility = ["//visibility:public"], +) diff --git a/tools/rules_cgo.patch b/tools/rules_cgo.patch new file mode 100644 index 0000000000..fb916bd650 --- /dev/null +++ b/tools/rules_cgo.patch @@ -0,0 +1,17 @@ +diff --git a/go/private/rules/cgo.bzl b/go/private/rules/cgo.bzl +index b8fc93a6..3fec27e4 100644 +--- a/go/private/rules/cgo.bzl ++++ b/go/private/rules/cgo.bzl +@@ -144,7 +144,12 @@ def cgo_configure(go, srcs, cdeps, cppopts, copts, cxxopts, clinkopts): + # libclntsh.dylib.12.1, users have to create a unversioned symbolic link, + # so it can be treated as a simple shared library too. + continue ++ # Make each object in the C libraries after this line to be included in the resulting ++ # binary object to guarantee necessary symbols (i.e. DPDK drivers) can be found. ++ lib_opts.append("-Wl,--whole-archive") + lib_opts.append(lib.path) ++ # Disable whole-archive for other libraries. ++ lib_opts.append("-Wl,--no-whole-archive") + clinkopts.extend(cc_link_flags) + + elif hasattr(d, "objc"): \ No newline at end of file