diff --git a/Makefile b/Makefile index 28d70672c..3f81da9c9 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,6 @@ DOC_FILES := \ runtime.md \ runtime-linux.md \ config.md \ - config-linux.md \ runtime-config.md \ runtime-config-linux.md \ glossary.md diff --git a/config-linux.md b/config-linux.md deleted file mode 100644 index 7049e95e8..000000000 --- a/config-linux.md +++ /dev/null @@ -1,39 +0,0 @@ -# Linux-specific Container Configuration - -The Linux container specification uses various kernel features like namespaces, cgroups, capabilities, LSM, and file system jails to fulfill the spec. -Additional information is needed for Linux over the [default spec configuration](config.md) in order to configure these various kernel features. - -## Capabilities - -Capabilities is an array that specifies Linux capabilities that can be provided to the process inside the container. -Valid values are the strings for capabilities defined in [the man page](http://man7.org/linux/man-pages/man7/capabilities.7.html) - -```json - "capabilities": [ - "CAP_AUDIT_WRITE", - "CAP_KILL", - "CAP_NET_BIND_SERVICE" - ] -``` - -## Default Devices and File Systems - -The Linux ABI includes both syscalls and several special file paths. -Applications expecting a Linux environment will very likely expect these files paths to be setup correctly. 
- -The following devices and filesystems MUST be made available in each application's filesystem - -| Path | Type | Notes | -| ------------ | ------ | ------- | -| /proc | [procfs](https://www.kernel.org/doc/Documentation/filesystems/proc.txt) | | -| /sys | [sysfs](https://www.kernel.org/doc/Documentation/filesystems/sysfs.txt) | | -| /dev/null | [device](http://man7.org/linux/man-pages/man4/null.4.html) | | -| /dev/zero | [device](http://man7.org/linux/man-pages/man4/zero.4.html) | | -| /dev/full | [device](http://man7.org/linux/man-pages/man4/full.4.html) | | -| /dev/random | [device](http://man7.org/linux/man-pages/man4/random.4.html) | | -| /dev/urandom | [device](http://man7.org/linux/man-pages/man4/random.4.html) | | -| /dev/tty | [device](http://man7.org/linux/man-pages/man4/tty.4.html) | | -| /dev/console | [device](http://man7.org/linux/man-pages/man4/console.4.html) | | -| /dev/pts | [devpts](https://www.kernel.org/doc/Documentation/filesystems/devpts.txt) | | -| /dev/ptmx | [device](https://www.kernel.org/doc/Documentation/filesystems/devpts.txt) | Bind-mount or symlink of /dev/pts/ptmx | -| /dev/shm | [tmpfs](https://www.kernel.org/doc/Documentation/filesystems/tmpfs.txt) | | diff --git a/config.go b/config.go index 1a0678b50..f8af651b6 100644 --- a/config.go +++ b/config.go @@ -16,6 +16,9 @@ type Spec struct { Hostname string `json:"hostname,omitempty"` // Mounts profile configuration for adding mounts to the container's filesystem. Mounts []MountPoint `json:"mounts"` + + // Linux is platform specific configuration for linux based containers. (this field is platform dependent) + Linux Linux `json:"linux,omitempty" platform:"linux"` } // Process contains information to start a specific application inside the container. @@ -33,6 +36,17 @@ type Process struct { Cwd string `json:"cwd"` } +// User specifies linux specific user and group information for the container's +// main process. +type User struct { + // UID is the user ID the Process is executed as. 
(this field is platform dependent) + UID uint32 `json:"uid,omitempty" platform:"linux"` + // GID is the group ID the Process is executed as. (this field is platform dependent) + GID uint32 `json:"gid,omitempty" platform:"linux"` + // AdditionalGids are additional group ids set for the container's process. (this field is platform dependent) + AdditionalGids []uint32 `json:"additionalGids,omitempty" platform:"linux"` +} + // Root contains information about the container's root filesystem on the host. type Root struct { // Path is the absolute path to the container's root filesystem. @@ -57,3 +71,9 @@ type MountPoint struct { // Path specifies the path of the mount. The path and child directories MUST exist, a runtime MUST NOT create directories automatically to a mount point. Path string `json:"path"` } + +// Linux contains platform specific configuration for linux based containers. +type Linux struct { + // Capabilities are linux capabilities that are kept for the container. + Capabilities []string `json:"capabilities"` +} diff --git a/config.md b/config.md index ae28d98d6..e52ee5da5 100644 --- a/config.md +++ b/config.md @@ -28,10 +28,10 @@ Each container has exactly one *root filesystem*, specified in the *root* object *Example* ```json -"root": { + "root": { "path": "rootfs", "readonly": true -} + } ``` ## Mount Points @@ -46,24 +46,24 @@ The runtime MUST mount entries in the listed order. *Example* ```json -"mounts": [ + "mounts": [ { - "name": "proc", - "path": "/proc" + "name": "proc", + "path": "/proc" }, { - "name": "dev", - "path": "/dev" + "name": "dev", + "path": "/dev" }, { - "name": "devpts", - "path": "/dev/pts" + "name": "devpts", + "path": "/dev/pts" }, { - "name": "data", - "path": "/data" + "name": "data", + "path": "/data" } -] + ] ``` ## Process configuration @@ -74,11 +74,11 @@ The runtime MUST mount entries in the listed order. * **`args`** (string, required) executable to launch and any flags as an array. 
The executable is the first element and must be available at the given path inside of the rootfs. If the executable path is not an absolute path then the search $PATH is interpreted to find the executable. The user for the process is a platform-specific structure that allows specific control over which user the process runs as. -For Linux-based systems the user structure has the following fields: +The user structure has the following fields: -* **`uid`** (int, required) specifies the user id. -* **`gid`** (int, required) specifies the group id. -* **`additionalGids`** (array of ints, optional) specifies additional group ids to be added to the process. +* **`uid`** (int, required on Linux) specifies the user id. +* **`gid`** (int, required on Linux) specifies the group id. +* **`additionalGids`** (array of ints, optional on Linux) specifies additional group ids to be added to the process. *Example (Linux)* @@ -101,7 +101,6 @@ For Linux-based systems the user structure has the following fields: } ``` - ## Hostname * **`hostname`** (string, optional) as it is accessible to processes running inside. On Linux, you can only set this if your bundle creates a new [UTS namespace][uts-namespace]. @@ -117,6 +116,8 @@ For Linux-based systems the user structure has the following fields: * **`os`** (string, required) specifies the operating system family this image must run on. Values for os must be in the list specified by the Go Language document for [`$GOOS`](https://golang.org/doc/install/source#environment). * **`arch`** (string, required) specifies the instruction set for which the binaries in the image have been compiled. Values for arch must be in the list specified by the Go Language document for [`$GOARCH`](https://golang.org/doc/install/source#environment). 
+*Example* + ```json "platform": { "os": "linux", @@ -125,6 +126,46 @@ For Linux-based systems the user structure has the following fields: ``` Interpretation of the platform section of the JSON file is used to find which platform-specific sections may be available in the document. -For example, if `os` is set to `linux`, then a JSON object conforming to the [Linux-specific schema](config-linux.md) SHOULD be found at the key `linux` in the `config.json`. +For example, if `os` is set to `linux`, then a JSON object conforming to the [Linux-specific schema](#linux-specific-container-configuration) SHOULD be found at the key `linux` in the `config.json`. + +## Linux-specific Container Configuration + +The Linux container specification +uses various kernel features like namespaces, cgroups, capabilities, LSM, and file system jails to fulfill the spec. + +### Capabilities + +Capabilities is an array that specifies Linux capabilities that can be provided to the process inside the container. +Valid values are the strings for capabilities defined in [the man page](http://man7.org/linux/man-pages/man7/capabilities.7.html). + +```json + "capabilities": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ] +``` + +### Default Devices and File Systems + +The Linux ABI includes both syscalls and several special file paths. +Applications expecting a Linux environment will very likely expect these file paths to be set up correctly. 
+ +The following devices and filesystems MUST be made available in each application's filesystem + +| Path | Type | Notes | +| ------------ | ------ | ------- | +| /proc | [procfs](https://www.kernel.org/doc/Documentation/filesystems/proc.txt) | | +| /sys | [sysfs](https://www.kernel.org/doc/Documentation/filesystems/sysfs.txt) | | +| /dev/null | [device](http://man7.org/linux/man-pages/man4/null.4.html) | | +| /dev/zero | [device](http://man7.org/linux/man-pages/man4/zero.4.html) | | +| /dev/full | [device](http://man7.org/linux/man-pages/man4/full.4.html) | | +| /dev/random | [device](http://man7.org/linux/man-pages/man4/random.4.html) | | +| /dev/urandom | [device](http://man7.org/linux/man-pages/man4/random.4.html) | | +| /dev/tty | [device](http://man7.org/linux/man-pages/man4/tty.4.html) | | +| /dev/console | [device](http://man7.org/linux/man-pages/man4/console.4.html) | | +| /dev/pts | [devpts](https://www.kernel.org/doc/Documentation/filesystems/devpts.txt) | | +| /dev/ptmx | [device](https://www.kernel.org/doc/Documentation/filesystems/devpts.txt) | Bind-mount or symlink of /dev/pts/ptmx | +| /dev/shm | [tmpfs](https://www.kernel.org/doc/Documentation/filesystems/tmpfs.txt) | | [uts-namespace]: http://man7.org/linux/man-pages/man7/namespaces.7.html diff --git a/config_linux.go b/config_linux.go deleted file mode 100644 index a715bb567..000000000 --- a/config_linux.go +++ /dev/null @@ -1,25 +0,0 @@ -package specs - -// LinuxSpec is the full specification for linux containers. -type LinuxSpec struct { - Spec - // Linux is platform specific configuration for linux based containers. - Linux Linux `json:"linux"` -} - -// Linux contains platform specific configuration for linux based containers. -type Linux struct { - // Capabilities are linux capabilities that are kept for the container. - Capabilities []string `json:"capabilities"` -} - -// User specifies linux specific user and group information for the container's -// main process. 
-type User struct { - // UID is the user id. - UID uint32 `json:"uid"` - // GID is the group id. - GID uint32 `json:"gid"` - // AdditionalGids are additional group ids set for the container's process. - AdditionalGids []uint32 `json:"additionalGids,omitempty"` -} diff --git a/runtime-config-linux.md b/runtime-config-linux.md index 4d2f9adfe..a2923c0a1 100644 --- a/runtime-config-linux.md +++ b/runtime-config-linux.md @@ -22,7 +22,7 @@ The following parameters can be specified to setup namespaces: If a path is specified, that particular file is used to join that type of namespace. Also, when a path is specified, a runtime MUST assume that the setup for that particular namespace has already been done and error out if the config specifies anything else related to that namespace. -###### Example +*Example* ```json "namespaces": [ @@ -51,7 +51,7 @@ Also, when a path is specified, a runtime MUST assume that the setup for that pa ## User namespace mappings -###### Example +*Example* ```json "uidMappings": [ @@ -97,7 +97,7 @@ The following parameters can be specified: **`fileMode`**, **`uid`** and **`gid`** are required if **`path`** is given and are otherwise not allowed. -###### Example +*Example* ```json "devices": [ @@ -178,7 +178,7 @@ The Spec does not include naming schema for cgroups. The Spec does not support [split hierarchy](https://www.kernel.org/doc/Documentation/cgroups/unified-hierarchy.txt). The cgroups will be created if they don't exist. 
-###### Example +*Example* ```json "cgroupsPath": "/myRuntime/myContainer" @@ -200,7 +200,7 @@ For more information, see [the memory cgroup man page](https://www.kernel.org/do * **`disableOOMKiller`** *(bool, optional)* - enables or disables the OOM killer -###### Example +*Example* ```json "disableOOMKiller": false @@ -215,9 +215,7 @@ For more information on how these two settings work together, see [the memory cg * **`oomScoreAdj`** *(int, optional)* - adjust the oom-killer score -###### Example - -###### Example +*Example* ```json "oomScoreAdj": 0 @@ -242,7 +240,7 @@ The following parameters can be specified to setup the controller: * **`swappiness`** *(uint64, optional)* - sets swappiness parameter of vmscan (See sysctl's vm.swappiness) -###### Example +*Example* ```json "memory": { @@ -276,7 +274,7 @@ The following parameters can be specified to setup the controller: * **`mems`** *(string, optional)* - list of Memory Nodes the container will run in -###### Example +*Example* ```json "cpu": { @@ -312,7 +310,7 @@ The following parameters can be specified to setup the controller: * **`major, minor`** *(int64, required)* - major, minor numbers for device. More info in `man mknod`. * **`rate`** *(uint64, required)* - IO rate limit for the device -###### Example +*Example* ```json "blockIO": { @@ -360,7 +358,7 @@ For more information, see the [kernel cgroups documentation about HugeTLB](https * **`limit`** *(uint64, required)* - limit in bytes of *hugepagesize* HugeTLB usage -###### Example +*Example* ```json "hugepageLimits": [ @@ -385,7 +383,7 @@ processes in the group and egressing the system on various interfaces. 
The follo * **`name`** *(string, required)* - interface name * **`priority`** *(uint32, required)* - priority applied to the interface -###### Example +*Example* ```json "network": { @@ -413,7 +411,7 @@ The following paramters can be specified to setup the controller: * **`limit`** *(int64, required)* - specifies the maximum number of tasks in the cgroup -###### Example +*Example* ```json "pids": { @@ -426,7 +424,7 @@ The following paramters can be specified to setup the controller: sysctl allows kernel parameters to be modified at runtime for the container. For more information, see [the man page](http://man7.org/linux/man-pages/man8/sysctl.8.html) -###### Example +*Example* ```json "sysctl": { @@ -441,7 +439,7 @@ rlimits allow setting resource limits. `type` is a string with a value from those defined in [the man page](http://man7.org/linux/man-pages/man2/setrlimit.2.html). The kernel enforces the `soft` limit for a resource while the `hard` limit acts as a ceiling for that value that could be set by an unprivileged process. -###### Example +*Example* ```json "rlimits": [ @@ -458,7 +456,7 @@ The kernel enforces the `soft` limit for a resource while the `hard` limit acts SELinux process label specifies the label with which the processes in a container are run. For more information about SELinux, see [Selinux documentation](http://selinuxproject.org/page/Main_Page) -###### Example +*Example* ```json "selinuxProcessLabel": "system_u:system_r:svirt_lxc_net_t:s0:c124,c675" @@ -469,7 +467,7 @@ For more information about SELinux, see [Selinux documentation](http://selinuxp Apparmor profile specifies the name of the apparmor profile that will be used for the container. 
For more information about Apparmor, see [Apparmor documentation](https://wiki.ubuntu.com/AppArmor) -###### Example +*Example* ```json "apparmorProfile": "acme_secure_profile" @@ -512,7 +510,7 @@ Operator Constants: * `SCMP_CMP_GT` * `SCMP_CMP_MASKED_EQ` -###### Example +*Example* ```json "seccomp": { @@ -535,7 +533,7 @@ rootfsPropagation sets the rootfs's mount propagation. Its value is either slave, private, or shared. [The kernel doc](https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt) has more information about mount propagation. -###### Example +*Example* ```json "rootfsPropagation": "slave", diff --git a/runtime_config.go b/runtime_config.go index 931f912c4..24ede6606 100644 --- a/runtime_config.go +++ b/runtime_config.go @@ -1,5 +1,7 @@ package specs +import "os" + // RuntimeSpec contains host-specific configuration information for // a container. This information must not be included when the bundle // is packaged for distribution. @@ -10,6 +12,9 @@ type RuntimeSpec struct { Mounts map[string]Mount `json:"mounts"` // Hooks are the commands run at various lifecycle events of the container. Hooks Hooks `json:"hooks"` + + // LinuxRuntime is platform specific configuration for linux based containers. (this field is platform dependent) + Linux *LinuxRuntime `json:"linux,omitempty"` } // Hook specifies a command that is run at a particular event in the lifecycle of a container @@ -40,3 +45,297 @@ type Mount struct { // Options are fstab style mount options. Options []string `json:"options,omitempty"` } + +// LinuxStateDirectory holds the container's state information +const LinuxStateDirectory = "/run/opencontainer/containers" + +// LinuxRuntime hosts the Linux-only runtime information +type LinuxRuntime struct { + // UIDMapping specifies user mappings for supporting user namespaces on linux. + UIDMappings []IDMapping `json:"uidMappings,omitempty"` + // GIDMapping specifies group mappings for supporting user namespaces on linux. 
+ GIDMappings []IDMapping `json:"gidMappings,omitempty"` + // Rlimits specifies rlimit options to apply to the container's process. + Rlimits []Rlimit `json:"rlimits,omitempty"` + // Sysctl are a set of key value pairs that are set for the container on start + Sysctl map[string]string `json:"sysctl,omitempty"` + // Resources contain cgroup information for handling resource constraints + // for the container + Resources *Resources `json:"resources,omitempty"` + // CgroupsPath specifies the path to cgroups that are created and/or joined by the container. + // The path is expected to be relative to the cgroups mountpoint. + // If resources are specified, the cgroups at CgroupsPath will be updated based on resources. + CgroupsPath *string `json:"cgroupsPath,omitempty"` + // Namespaces contains the namespaces that are created and/or joined by the container + Namespaces []Namespace `json:"namespaces"` + // Devices are a list of device nodes that are created and enabled for the container + Devices []Device `json:"devices"` + // ApparmorProfile specified the apparmor profile for the container. + ApparmorProfile string `json:"apparmorProfile"` + // SelinuxProcessLabel specifies the selinux context that the container process is run as. + SelinuxProcessLabel string `json:"selinuxProcessLabel"` + // Seccomp specifies the seccomp security settings for the container. 
+ Seccomp Seccomp `json:"seccomp"` + // RootfsPropagation is the rootfs mount propagation mode for the container + RootfsPropagation string `json:"rootfsPropagation,omitempty"` +} + +// Namespace is the configuration for a linux namespace +type Namespace struct { + // Type is the type of Linux namespace + Type NamespaceType `json:"type"` + // Path is a path to an existing namespace persisted on disk that can be joined + // and is of the same type + Path string `json:"path,omitempty"` +} + +// NamespaceType is one of the linux namespaces +type NamespaceType string + +const ( + // PIDNamespace for isolating process IDs + PIDNamespace NamespaceType = "pid" + // NetworkNamespace for isolating network devices, stacks, ports, etc + NetworkNamespace = "network" + // MountNamespace for isolating mount points + MountNamespace = "mount" + // IPCNamespace for isolating System V IPC, POSIX message queues + IPCNamespace = "ipc" + // UTSNamespace for isolating hostname and NIS domain name + UTSNamespace = "uts" + // UserNamespace for isolating user and group IDs + UserNamespace = "user" +) + +// IDMapping specifies UID/GID mappings +type IDMapping struct { + // HostID is the UID/GID of the host user or group + HostID uint32 `json:"hostID"` + // ContainerID is the UID/GID of the container's user or group + ContainerID uint32 `json:"containerID"` + // Size is the length of the range of IDs mapped between the two namespaces + Size uint32 `json:"size"` +} + +// Rlimit type and restrictions +type Rlimit struct { + // Type of the rlimit to set + Type string `json:"type"` + // Hard is the hard limit for the specified type + Hard uint64 `json:"hard"` + // Soft is the soft limit for the specified type + Soft uint64 `json:"soft"` +} + +// HugepageLimit structure corresponds to limiting kernel hugepages +type HugepageLimit struct { + // Pagesize is the hugepage size + Pagesize *string `json:"pageSize,omitempty"` + // Limit is the limit of "hugepagesize" hugetlb usage + Limit *uint64 
`json:"limit,omitempty"` +} + +// InterfacePriority for network interfaces +type InterfacePriority struct { + // Name is the name of the network interface + Name string `json:"name"` + // Priority for the interface + Priority uint32 `json:"priority"` +} + +// blockIODevice holds major:minor format supported in blkio cgroup +type blockIODevice struct { + // Major is the device's major number. + Major int64 `json:"major"` + // Minor is the device's minor number. + Minor int64 `json:"minor"` +} + +// WeightDevice struct holds a `major:minor weight` pair for blkioWeightDevice +type WeightDevice struct { + blockIODevice + // Weight is the bandwidth rate for the device, range is from 10 to 1000 + Weight *uint16 `json:"weight,omitempty"` + // LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, range is from 10 to 1000, CFQ scheduler only + LeafWeight *uint16 `json:"leafWeight,omitempty"` +} + +// ThrottleDevice struct holds a `major:minor rate_per_second` pair +type ThrottleDevice struct { + blockIODevice + // Rate is the IO rate limit per cgroup per device + Rate *uint64 `json:"rate,omitempty"` +} + +// BlockIO for Linux cgroup 'blkio' resource management +type BlockIO struct { + // Specifies per cgroup weight, range is from 10 to 1000 + Weight *uint16 `json:"blkioWeight,omitempty"` + // Specifies tasks' weight in the given cgroup while competing with the cgroup's child cgroups, range is from 10 to 1000, CFQ scheduler only + LeafWeight *uint16 `json:"blkioLeafWeight,omitempty"` + // Weight per cgroup per device, can override BlkioWeight + WeightDevice []*WeightDevice `json:"blkioWeightDevice,omitempty"` + // IO read rate limit per cgroup per device, bytes per second + ThrottleReadBpsDevice []*ThrottleDevice `json:"blkioThrottleReadBpsDevice,omitempty"` + // IO write rate limit per cgroup per device, bytes per second + ThrottleWriteBpsDevice []*ThrottleDevice `json:"blkioThrottleWriteBpsDevice,omitempty"` + // IO read rate 
limit per cgroup per device, IO per second + ThrottleReadIOPSDevice []*ThrottleDevice `json:"blkioThrottleReadIOPSDevice,omitempty"` + // IO write rate limit per cgroup per device, IO per second + ThrottleWriteIOPSDevice []*ThrottleDevice `json:"blkioThrottleWriteIOPSDevice,omitempty"` +} + +// Memory for Linux cgroup 'memory' resource management +type Memory struct { + // Memory limit (in bytes). + Limit *uint64 `json:"limit,omitempty"` + // Memory reservation or soft_limit (in bytes). + Reservation *uint64 `json:"reservation,omitempty"` + // Total memory limit (memory + swap). + Swap *uint64 `json:"swap,omitempty"` + // Kernel memory limit (in bytes). + Kernel *uint64 `json:"kernel,omitempty"` + // Kernel memory limit for tcp (in bytes) + KernelTCP *uint64 `json:"kernelTCP"` + // How aggressive the kernel will swap memory pages. Range from 0 to 100. + Swappiness *uint64 `json:"swappiness,omitempty"` +} + +// CPU for Linux cgroup 'cpu' resource management +type CPU struct { + // CPU shares (relative weight (ratio) vs. other cgroups with cpu shares). + Shares *uint64 `json:"shares,omitempty"` + // CPU hardcap limit (in usecs). Allowed cpu time in a given period. + Quota *uint64 `json:"quota,omitempty"` + // CPU period to be used for hardcapping (in usecs). + Period *uint64 `json:"period,omitempty"` + // How much time realtime scheduling may use (in usecs). + RealtimeRuntime *uint64 `json:"realtimeRuntime,omitempty"` + // CPU period to be used for realtime scheduling (in usecs). + RealtimePeriod *uint64 `json:"realtimePeriod,omitempty"` + // CPUs to use within the cpuset. Default is to use any CPU available. + Cpus *string `json:"cpus,omitempty"` + // List of memory nodes in the cpuset. Default is to use any available memory node. + Mems *string `json:"mems,omitempty"` +} + +// Pids for Linux cgroup 'pids' resource management (Linux 4.3) +type Pids struct { + // Maximum number of PIDs. Default is "no limit". 
+ Limit *int64 `json:"limit,omitempty"` +} + +// Network identification and priority configuration +type Network struct { + // Set class identifier for container's network packets + ClassID *uint32 `json:"classID"` + // Set priority of network traffic for container + Priorities []InterfacePriority `json:"priorities"` +} + +// Resources has container runtime resource constraints +type Resources struct { + // DisableOOMKiller disables the OOM killer for out of memory conditions + DisableOOMKiller *bool `json:"disableOOMKiller,omitempty"` + // Specify an oom_score_adj for the container. + OOMScoreAdj *int `json:"oomScoreAdj,omitempty"` + // Memory restriction configuration + Memory *Memory `json:"memory,omitempty"` + // CPU resource restriction configuration + CPU *CPU `json:"cpu,omitempty"` + // Task resource restriction configuration. + Pids *Pids `json:"pids,omitempty"` + // BlockIO restriction configuration + BlockIO *BlockIO `json:"blockIO,omitempty"` + // Hugetlb limit (in bytes) + HugepageLimits []HugepageLimit `json:"hugepageLimits,omitempty"` + // Network restriction configuration + Network *Network `json:"network,omitempty"` +} + +// Device represents the information on a Linux special device file +type Device struct { + // Path to the device. + Path string `json:"path"` + // Device type, block, char, etc. + Type rune `json:"type"` + // Major is the device's major number. + Major int64 `json:"major"` + // Minor is the device's minor number. + Minor int64 `json:"minor"` + // Cgroup permissions format, rwm. + Permissions string `json:"permissions"` + // FileMode permission bits for the device. + FileMode os.FileMode `json:"fileMode"` + // UID of the device. + UID uint32 `json:"uid"` + // Gid of the device. 
+ GID uint32 `json:"gid"` +} + +// Seccomp represents syscall restrictions +type Seccomp struct { + DefaultAction Action `json:"defaultAction"` + Architectures []Arch `json:"architectures"` + Syscalls []*Syscall `json:"syscalls"` +} + +// Arch used for additional architectures +type Arch string + +// Additional architectures permitted to be used for system calls +// By default only the native architecture of the kernel is permitted +const ( + ArchX86 Arch = "SCMP_ARCH_X86" + ArchX86_64 Arch = "SCMP_ARCH_X86_64" + ArchX32 Arch = "SCMP_ARCH_X32" + ArchARM Arch = "SCMP_ARCH_ARM" + ArchAARCH64 Arch = "SCMP_ARCH_AARCH64" + ArchMIPS Arch = "SCMP_ARCH_MIPS" + ArchMIPS64 Arch = "SCMP_ARCH_MIPS64" + ArchMIPS64N32 Arch = "SCMP_ARCH_MIPS64N32" + ArchMIPSEL Arch = "SCMP_ARCH_MIPSEL" + ArchMIPSEL64 Arch = "SCMP_ARCH_MIPSEL64" + ArchMIPSEL64N32 Arch = "SCMP_ARCH_MIPSEL64N32" +) + +// Action taken upon Seccomp rule match +type Action string + +// Define actions for Seccomp rules +const ( + ActKill Action = "SCMP_ACT_KILL" + ActTrap Action = "SCMP_ACT_TRAP" + ActErrno Action = "SCMP_ACT_ERRNO" + ActTrace Action = "SCMP_ACT_TRACE" + ActAllow Action = "SCMP_ACT_ALLOW" +) + +// Operator used to match syscall arguments in Seccomp +type Operator string + +// Define operators for syscall arguments in Seccomp +const ( + OpNotEqual Operator = "SCMP_CMP_NE" + OpLessThan Operator = "SCMP_CMP_LT" + OpLessEqual Operator = "SCMP_CMP_LE" + OpEqualTo Operator = "SCMP_CMP_EQ" + OpGreaterEqual Operator = "SCMP_CMP_GE" + OpGreaterThan Operator = "SCMP_CMP_GT" + OpMaskedEqual Operator = "SCMP_CMP_MASKED_EQ" +) + +// Arg used for matching specific syscall arguments in Seccomp +type Arg struct { + Index uint `json:"index"` + Value uint64 `json:"value"` + ValueTwo uint64 `json:"valueTwo"` + Op Operator `json:"op"` +} + +// Syscall is used to match a syscall in Seccomp +type Syscall struct { + Name string `json:"name"` + Action Action `json:"action"` + Args []*Arg `json:"args"` +} diff --git 
a/runtime_config_linux.go b/runtime_config_linux.go deleted file mode 100644 index b32a21616..000000000 --- a/runtime_config_linux.go +++ /dev/null @@ -1,304 +0,0 @@ -package specs - -import "os" - -// LinuxStateDirectory holds the container's state information -const LinuxStateDirectory = "/run/opencontainer/containers" - -// LinuxRuntimeSpec is the full specification for linux containers. -type LinuxRuntimeSpec struct { - RuntimeSpec - // LinuxRuntime is platform specific configuration for linux based containers. - Linux LinuxRuntime `json:"linux"` -} - -// LinuxRuntime hosts the Linux-only runtime information -type LinuxRuntime struct { - // UIDMapping specifies user mappings for supporting user namespaces on linux. - UIDMappings []IDMapping `json:"uidMappings,omitempty"` - // GIDMapping specifies group mappings for supporting user namespaces on linux. - GIDMappings []IDMapping `json:"gidMappings,omitempty"` - // Rlimits specifies rlimit options to apply to the container's process. - Rlimits []Rlimit `json:"rlimits,omitempty"` - // Sysctl are a set of key value pairs that are set for the container on start - Sysctl map[string]string `json:"sysctl,omitempty"` - // Resources contain cgroup information for handling resource constraints - // for the container - Resources *Resources `json:"resources,omitempty"` - // CgroupsPath specifies the path to cgroups that are created and/or joined by the container. - // The path is expected to be relative to the cgroups mountpoint. - // If resources are specified, the cgroups at CgroupsPath will be updated based on resources. - CgroupsPath *string `json:"cgroupsPath,omitempty"` - // Namespaces contains the namespaces that are created and/or joined by the container - Namespaces []Namespace `json:"namespaces"` - // Devices are a list of device nodes that are created and enabled for the container - Devices []Device `json:"devices"` - // ApparmorProfile specified the apparmor profile for the container. 
- ApparmorProfile string `json:"apparmorProfile"` - // SelinuxProcessLabel specifies the selinux context that the container process is run as. - SelinuxProcessLabel string `json:"selinuxProcessLabel"` - // Seccomp specifies the seccomp security settings for the container. - Seccomp Seccomp `json:"seccomp"` - // RootfsPropagation is the rootfs mount propagation mode for the container - RootfsPropagation string `json:"rootfsPropagation,omitempty"` -} - -// Namespace is the configuration for a linux namespace -type Namespace struct { - // Type is the type of Linux namespace - Type NamespaceType `json:"type"` - // Path is a path to an existing namespace persisted on disk that can be joined - // and is of the same type - Path string `json:"path,omitempty"` -} - -// NamespaceType is one of the linux namespaces -type NamespaceType string - -const ( - // PIDNamespace for isolating process IDs - PIDNamespace NamespaceType = "pid" - // NetworkNamespace for isolating network devices, stacks, ports, etc - NetworkNamespace = "network" - // MountNamespace for isolating mount points - MountNamespace = "mount" - // IPCNamespace for isolating System V IPC, POSIX message queues - IPCNamespace = "ipc" - // UTSNamespace for isolating hostname and NIS domain name - UTSNamespace = "uts" - // UserNamespace for isolating user and group IDs - UserNamespace = "user" -) - -// IDMapping specifies UID/GID mappings -type IDMapping struct { - // HostID is the UID/GID of the host user or group - HostID uint32 `json:"hostID"` - // ContainerID is the UID/GID of the container's user or group - ContainerID uint32 `json:"containerID"` - // Size is the length of the range of IDs mapped between the two namespaces - Size uint32 `json:"size"` -} - -// Rlimit type and restrictions -type Rlimit struct { - // Type of the rlimit to set - Type string `json:"type"` - // Hard is the hard limit for the specified type - Hard uint64 `json:"hard"` - // Soft is the soft limit for the specified type - Soft uint64 
`json:"soft"` -} - -// HugepageLimit structure corresponds to limiting kernel hugepages -type HugepageLimit struct { - // Pagesize is the hugepage size - Pagesize *string `json:"pageSize,omitempty"` - // Limit is the limit of "hugepagesize" hugetlb usage - Limit *uint64 `json:"limit,omitempty"` -} - -// InterfacePriority for network interfaces -type InterfacePriority struct { - // Name is the name of the network interface - Name string `json:"name"` - // Priority for the interface - Priority uint32 `json:"priority"` -} - -// blockIODevice holds major:minor format supported in blkio cgroup -type blockIODevice struct { - // Major is the device's major number. - Major int64 `json:"major"` - // Minor is the device's minor number. - Minor int64 `json:"minor"` -} - -// WeightDevice struct holds a `major:minor weight` pair for blkioWeightDevice -type WeightDevice struct { - blockIODevice - // Weight is the bandwidth rate for the device, range is from 10 to 1000 - Weight *uint16 `json:"weight,omitempty"` - // LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, range is from 10 to 1000, CFQ scheduler only - LeafWeight *uint16 `json:"leafWeight,omitempty"` -} - -// ThrottleDevice struct holds a `major:minor rate_per_second` pair -type ThrottleDevice struct { - blockIODevice - // Rate is the IO rate limit per cgroup per device - Rate *uint64 `json:"rate,omitempty"` -} - -// BlockIO for Linux cgroup 'blkio' resource management -type BlockIO struct { - // Specifies per cgroup weight, range is from 10 to 1000 - Weight *uint16 `json:"blkioWeight,omitempty"` - // Specifies tasks' weight in the given cgroup while competing with the cgroup's child cgroups, range is from 10 to 1000, CFQ scheduler only - LeafWeight *uint16 `json:"blkioLeafWeight,omitempty"` - // Weight per cgroup per device, can override BlkioWeight - WeightDevice []*WeightDevice `json:"blkioWeightDevice,omitempty"` - // IO read rate limit per cgroup per device, bytes per 
second - ThrottleReadBpsDevice []*ThrottleDevice `json:"blkioThrottleReadBpsDevice,omitempty"` - // IO write rate limit per cgroup per device, bytes per second - ThrottleWriteBpsDevice []*ThrottleDevice `json:"blkioThrottleWriteBpsDevice,omitempty"` - // IO read rate limit per cgroup per device, IO per second - ThrottleReadIOPSDevice []*ThrottleDevice `json:"blkioThrottleReadIOPSDevice,omitempty"` - // IO write rate limit per cgroup per device, IO per second - ThrottleWriteIOPSDevice []*ThrottleDevice `json:"blkioThrottleWriteIOPSDevice,omitempty"` -} - -// Memory for Linux cgroup 'memory' resource management -type Memory struct { - // Memory limit (in bytes). - Limit *uint64 `json:"limit,omitempty"` - // Memory reservation or soft_limit (in bytes). - Reservation *uint64 `json:"reservation,omitempty"` - // Total memory limit (memory + swap). - Swap *uint64 `json:"swap,omitempty"` - // Kernel memory limit (in bytes). - Kernel *uint64 `json:"kernel,omitempty"` - // Kernel memory limit for tcp (in bytes) - KernelTCP *uint64 `json:"kernelTCP"` - // How aggressive the kernel will swap memory pages. Range from 0 to 100. - Swappiness *uint64 `json:"swappiness,omitempty"` -} - -// CPU for Linux cgroup 'cpu' resource management -type CPU struct { - // CPU shares (relative weight (ratio) vs. other cgroups with cpu shares). - Shares *uint64 `json:"shares,omitempty"` - // CPU hardcap limit (in usecs). Allowed cpu time in a given period. - Quota *uint64 `json:"quota,omitempty"` - // CPU period to be used for hardcapping (in usecs). - Period *uint64 `json:"period,omitempty"` - // How much time realtime scheduling may use (in usecs). - RealtimeRuntime *uint64 `json:"realtimeRuntime,omitempty"` - // CPU period to be used for realtime scheduling (in usecs). - RealtimePeriod *uint64 `json:"realtimePeriod,omitempty"` - // CPUs to use within the cpuset. Default is to use any CPU available. - Cpus *string `json:"cpus,omitempty"` - // List of memory nodes in the cpuset. 
Default is to use any available memory node. - Mems *string `json:"mems,omitempty"` -} - -// Pids for Linux cgroup 'pids' resource management (Linux 4.3) -type Pids struct { - // Maximum number of PIDs. Default is "no limit". - Limit *int64 `json:"limit,omitempty"` -} - -// Network identification and priority configuration -type Network struct { - // Set class identifier for container's network packets - ClassID *uint32 `json:"classID"` - // Set priority of network traffic for container - Priorities []InterfacePriority `json:"priorities"` -} - -// Resources has container runtime resource constraints -type Resources struct { - // DisableOOMKiller disables the OOM killer for out of memory conditions - DisableOOMKiller *bool `json:"disableOOMKiller,omitempty"` - // Specify an oom_score_adj for the container. - OOMScoreAdj *int `json:"oomScoreAdj,omitempty"` - // Memory restriction configuration - Memory *Memory `json:"memory,omitempty"` - // CPU resource restriction configuration - CPU *CPU `json:"cpu,omitempty"` - // Task resource restriction configuration. - Pids *Pids `json:"pids,omitempty"` - // BlockIO restriction configuration - BlockIO *BlockIO `json:"blockIO,omitempty"` - // Hugetlb limit (in bytes) - HugepageLimits []HugepageLimit `json:"hugepageLimits,omitempty"` - // Network restriction configuration - Network *Network `json:"network,omitempty"` -} - -// Device represents the information on a Linux special device file -type Device struct { - // Path to the device. - Path string `json:"path"` - // Device type, block, char, etc. - Type rune `json:"type"` - // Major is the device's major number. - Major int64 `json:"major"` - // Minor is the device's minor number. - Minor int64 `json:"minor"` - // Cgroup permissions format, rwm. - Permissions string `json:"permissions"` - // FileMode permission bits for the device. - FileMode os.FileMode `json:"fileMode"` - // UID of the device. - UID uint32 `json:"uid"` - // Gid of the device. 
- GID uint32 `json:"gid"` -} - -// Seccomp represents syscall restrictions -type Seccomp struct { - DefaultAction Action `json:"defaultAction"` - Architectures []Arch `json:"architectures"` - Syscalls []*Syscall `json:"syscalls"` -} - -// Arch used for additional architectures -type Arch string - -// Additional architectures permitted to be used for system calls -// By default only the native architecture of the kernel is permitted -const ( - ArchX86 Arch = "SCMP_ARCH_X86" - ArchX86_64 Arch = "SCMP_ARCH_X86_64" - ArchX32 Arch = "SCMP_ARCH_X32" - ArchARM Arch = "SCMP_ARCH_ARM" - ArchAARCH64 Arch = "SCMP_ARCH_AARCH64" - ArchMIPS Arch = "SCMP_ARCH_MIPS" - ArchMIPS64 Arch = "SCMP_ARCH_MIPS64" - ArchMIPS64N32 Arch = "SCMP_ARCH_MIPS64N32" - ArchMIPSEL Arch = "SCMP_ARCH_MIPSEL" - ArchMIPSEL64 Arch = "SCMP_ARCH_MIPSEL64" - ArchMIPSEL64N32 Arch = "SCMP_ARCH_MIPSEL64N32" -) - -// Action taken upon Seccomp rule match -type Action string - -// Define actions for Seccomp rules -const ( - ActKill Action = "SCMP_ACT_KILL" - ActTrap Action = "SCMP_ACT_TRAP" - ActErrno Action = "SCMP_ACT_ERRNO" - ActTrace Action = "SCMP_ACT_TRACE" - ActAllow Action = "SCMP_ACT_ALLOW" -) - -// Operator used to match syscall arguments in Seccomp -type Operator string - -// Define operators for syscall arguments in Seccomp -const ( - OpNotEqual Operator = "SCMP_CMP_NE" - OpLessThan Operator = "SCMP_CMP_LT" - OpLessEqual Operator = "SCMP_CMP_LE" - OpEqualTo Operator = "SCMP_CMP_EQ" - OpGreaterEqual Operator = "SCMP_CMP_GE" - OpGreaterThan Operator = "SCMP_CMP_GT" - OpMaskedEqual Operator = "SCMP_CMP_MASKED_EQ" -) - -// Arg used for matching specific syscall arguments in Seccomp -type Arg struct { - Index uint `json:"index"` - Value uint64 `json:"value"` - ValueTwo uint64 `json:"valueTwo"` - Op Operator `json:"op"` -} - -// Syscall is used to match a syscall in Seccomp -type Syscall struct { - Name string `json:"name"` - Action Action `json:"action"` - Args []*Arg `json:"args"` -}