From 41b93bc2406f7ad21b06b90c62088bf36a3a3f77 Mon Sep 17 00:00:00 2001 From: Rodrigo Campos Date: Thu, 7 Apr 2022 16:50:29 +0200 Subject: [PATCH] KEP-127: Changes for alpha and mark implementable This adds the needed KEP metadata, CRI changes, etc. Signed-off-by: Rodrigo Campos Co-authored-by: Giuseppe Scrivano Signed-off-by: Giuseppe Scrivano --- keps/prod-readiness/sig-node/127.yaml | 6 + keps/sig-node/127-user-namespaces/README.md | 737 ++++++++++++++++++-- keps/sig-node/127-user-namespaces/kep.yaml | 10 +- 3 files changed, 683 insertions(+), 70 deletions(-) create mode 100644 keps/prod-readiness/sig-node/127.yaml diff --git a/keps/prod-readiness/sig-node/127.yaml b/keps/prod-readiness/sig-node/127.yaml new file mode 100644 index 00000000000..acb4e146f6d --- /dev/null +++ b/keps/prod-readiness/sig-node/127.yaml @@ -0,0 +1,6 @@ +# The KEP must have an approver from the +# "prod-readiness-approvers" group +# of http://git.k8s.io/enhancements/OWNERS_ALIASES +kep-number: 127 +alpha: + approver: "wojtek-t" diff --git a/keps/sig-node/127-user-namespaces/README.md b/keps/sig-node/127-user-namespaces/README.md index e74172cab45..c0f8142f01a 100644 --- a/keps/sig-node/127-user-namespaces/README.md +++ b/keps/sig-node/127-user-namespaces/README.md @@ -14,18 +14,23 @@ - [Story 4](#story-4) - [Story 5](#story-5) - [Notes/Constraints/Caveats](#notesconstraintscaveats) - - [UID space](#uid-space) - - [Volume support](#volume-support) - [Risks and Mitigations](#risks-and-mitigations) - [Design Details](#design-details) - [Pod.spec changes](#podspec-changes) + - [CRI changes](#cri-changes) - [Phases](#phases) - [Phase 1: pods "without" volumes](#phase-1-pods-without-volumes) + - [pkg/volume changes for phase I](#pkgvolume-changes-for-phase-i) - [Phase 2: pods with volumes](#phase-2-pods-with-volumes) - [Phase 3: TBD](#phase-3-tbd) - [Unresolved](#unresolved) - [Summary of the Proposed Changes](#summary-of-the-proposed-changes) - [Test Plan](#test-plan) + - [Prerequisite testing updates](#prerequisite-testing-updates) + - [Unit tests](#unit-tests) + - [Integration tests](#integration-tests) + - [e2e tests](#e2e-tests) + - [critests](#critests) - [Graduation Criteria](#graduation-criteria) - [Alpha](#alpha) - [Beta](#beta) @@ -42,20 +47,20 @@ - [Implementation History](#implementation-history) - [Drawbacks](#drawbacks) - [Alternatives](#alternatives) -- [References](#references) +- [Infrastructure Needed (Optional)](#infrastructure-needed-optional) ## Release Signoff Checklist Items marked with (R) are required *prior to targeting to a milestone / release*. 
-- [ ] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR)
-- [ ] (R) KEP approvers have approved the KEP status as `implementable`
-- [ ] (R) Design details are appropriately documented
-- [ ] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input
-- [ ] (R) Graduation criteria is in place
-- [ ] (R) Production readiness review completed
-- [ ] Production readiness review approved
+- [X] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR)
+- [X] (R) KEP approvers have approved the KEP status as `implementable`
+- [X] (R) Design details are appropriately documented
+- [X] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input
+- [X] (R) Graduation criteria is in place
+- [X] (R) Production readiness review completed
+- [X] Production readiness review approved
 - [ ] "Implementation History" section is up-to-date for milestone
 - [ ] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io]
 - [ ] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes
@@ -183,10 +188,6 @@ impact a compromised pod can have on other pods and the node itself.
 
 ### Notes/Constraints/Caveats
 
-#### UID space
-
-#### Volume support
-
 ### Risks and Mitigations
 
 ## Design Details
@@ -195,10 +196,71 @@ impact a compromised pod can have on other pods and the node itself.
 
 The following changes will be done to the pod.spec:
 
-- `pod.spec.hostUsers`: bool.
+- `pod.spec.hostUsers`: `*bool`.
 If true or not present, uses the host user namespace (as today)
 If false, a new userns is created for the pod.
-By default it is set to `true`.
+By default it is not set, which implies using the host user namespace.
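+
+For illustration only, a minimal (hypothetical) pod manifest opting into a new
+user namespace would set the proposed field like this:
+
+```
+apiVersion: v1
+kind: Pod
+metadata:
+  name: userns-example
+spec:
+  hostUsers: false            # opt out of the host user namespace
+  containers:
+  - name: app
+    image: example.com/app    # illustrative image
+```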
+
+### CRI changes
+
+The following messages will be added:
+
+```
+// IDMapping describes host to container ID mappings for a pod sandbox.
+message IDMapping {
+    // host_id is the id on the host.
+    uint32 host_id = 1;
+    // container_id is the id in the container.
+    uint32 container_id = 2;
+    // length is the size of the range to map.
+    uint32 length = 3;
+}
+
+// UserNamespace describes the intended user namespace configuration for a sandbox.
+message UserNamespace {
+    // NamespaceMode: `POD` or `NODE`
+    NamespaceMode mode = 1;
+
+    // uids specifies the UID mappings for the user namespace.
+    repeated IDMapping uids = 2;
+
+    // gids specifies the GID mappings for the user namespace.
+    repeated IDMapping gids = 3;
+}
+```
+
+The existing message `NamespaceOption` will have a `userns_options` field added.
+The complete `NamespaceOption` message with the new field is shown here:
+
+```
+// NamespaceOption provides options for Linux namespaces.
+message NamespaceOption {
+    // Network namespace for this container/sandbox.
+    // Note: There is currently no way to set CONTAINER scoped network in the Kubernetes API.
+    // Namespaces currently set by the kubelet: POD, NODE
+    NamespaceMode network = 1;
+    // PID namespace for this container/sandbox.
+    // Note: The CRI default is POD, but the v1.PodSpec default is CONTAINER.
+    // The kubelet's runtime manager will set this to CONTAINER explicitly for v1 pods.
+    // Namespaces currently set by the kubelet: POD, CONTAINER, NODE, TARGET
+    NamespaceMode pid = 2;
+    // IPC namespace for this container/sandbox.
+    // Note: There is currently no way to set CONTAINER scoped IPC in the Kubernetes API.
+    // Namespaces currently set by the kubelet: POD, NODE
+    NamespaceMode ipc = 3;
+    // Target Container ID for NamespaceMode of TARGET. This container must have been
+    // previously created in the same pod. It is not possible to specify different targets
+    // for each namespace.
+    string target_id = 4;
+    // User namespace for this sandbox.
+    // The Kubelet picks the user namespace configuration to use for the sandbox. The mappings
+    // are specified as part of the UserNamespace struct. If the struct is nil, then the POD mode
+    // must be assumed. This is done for backward compatibility with older Kubelet versions that
+    // do not set a user namespace.
+    UserNamespace userns_options = 5;
+}
+```
 
 ### Phases
 
@@ -206,9 +268,7 @@ We propose to divide the work in 3 phases. Each phase makes this work with
 either more isolation or more workloads. When no support is yet added to handle
 some workload, a clear error will be shown.
 
-PLEASE note that only phase 1 is targeted for alpha. Also note that the missing
-details (CRI changes, changes needed in container runtimes, etc.) will be added
-in follow-up PRs.
+PLEASE note that only phase 1 is targeted for alpha.
 
 Please note the last sub-section here is a table with the summary of the changes
 proposed on each phase. That table is not updated (it is from the initial
@@ -234,7 +294,8 @@ This list of volumes was chosen as they can't be used to share files with other
 pods.
 
 The mapping length will be 65535, mapping the range 0-65534 to the pod. This wide
-range makes sure most workloads will work fine.
+range makes sure most workloads will work fine. Additionally, we don't need to
+worry about fragmentation of IDs, as all pods will use the same length.
 
 The mapping will be chosen by the kubelet, using a simple algorithm to give
 different pods in this category ("without" volumes) a non-overlapping mapping.
@@ -245,11 +306,72 @@ mapping length of 65535 (2^16-1) top.
 This imposes a limit of 65k pods per
 node, but that is not an issue we will hit in practice for a long time, if ever
 (today we run 110 pods per node by default).
 
+Please note that a mapping is a bijection of host IDs to container IDs. The
+settings `RunAsUser`, `RunAsGroup`, `runAsNonRoot` in
+`pod.spec.containers.SecurityContext` will refer to the ID used inside the
+container. Therefore, the host IDs the kubelet chooses for the pod are not
+important for the end user changing the pod spec.
+
+The picked range will be stored under a file named `userns` in the pod folder
+(by default it is usually located in `/var/lib/kubelet/pods/$POD/userns`). This
+way, the Kubelet can read all the allocated mappings if it restarts.
+
 During phase 1, to make sure we don't exhaust the host UID namespace, we will
 limit the number of pods using user namespaces to `min(maxPods, 1024)`. This
 leaves us plenty of host UID space free and this limit is probably never hit
 in practice. See the UNRESOLVED section for the open questions we still have on
 this.
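+
+To make the scheme above concrete, here is a minimal, illustrative sketch of
+the allocation and ID translation logic (hypothetical names, not the actual
+kubelet code; the real user namespace manager will differ):
+
+```
+package userns
+
+import "fmt"
+
+const (
+    lenPerPod     = 65535 // each pod maps container IDs 0-65534
+    maxUsernsPods = 1024  // phase 1 cap: min(maxPods, 1024)
+    firstHostID   = 65536 // hypothetical first host ID handed out to pods
+)
+
+// mapping is one uid_map/gid_map entry: container IDs
+// [ContainerID, ContainerID+Length) map to host IDs [HostID, HostID+Length).
+type mapping struct {
+    ContainerID, HostID, Length uint32
+}
+
+// allocator hands out non-overlapping ranges. A real implementation would
+// also persist each range to /var/lib/kubelet/pods/$POD/userns and rebuild
+// this state when the kubelet restarts.
+type allocator struct {
+    used [maxUsernsPods]bool // one slot per possible pod range
+}
+
+func (a *allocator) allocate() (mapping, error) {
+    for slot, inUse := range a.used {
+        if !inUse {
+            a.used[slot] = true
+            return mapping{
+                ContainerID: 0,
+                HostID:      uint32(firstHostID + slot*lenPerPod),
+                Length:      lenPerPod,
+            }, nil
+        }
+    }
+    return mapping{}, fmt.Errorf("all %d user namespace ranges are in use", maxUsernsPods)
+}
+
+// hostID is the bijection mentioned above: it translates a container ID
+// (e.g. RunAsUser) to the host ID backing it.
+func (m mapping) hostID(containerID uint32) (uint32, error) {
+    if containerID < m.ContainerID || containerID >= m.ContainerID+m.Length {
+        return 0, fmt.Errorf("container ID %d is outside the pod's mapping", containerID)
+    }
+    return m.HostID + (containerID - m.ContainerID), nil
+}
+```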
+
+##### pkg/volume changes for phase I
+
+We need the files created by the kubelet for these volume types to be readable
+by the user the pod is mapped to in the host. Luckily, Kubernetes already
+supports specifying an [fsUser and fsGroup for volumes][volume-mounter-args]
+for projected SA tokens, so we just need to extend that to the other volume
+types we support in phase I.
+
+First, we need to teach the [operation executor][operation-executor] to convert
+the container UIDs/GIDs to the host UIDs/GIDs that this pod is mapped to. To do
+this, we will modify the [volumeHost interface][volumeHost-interface], adding
+functions to transform a container ID to the corresponding host ID, so the
+operation executor will just call that function (via
+`self.volumePluginMgr.Host.GetHostIDsForPod()`).
+
+This function will call the kubelet, which will use the user namespace manager
+created to transform the IDs for that pod. The [volumeHost
+interface][volumeHost-interface] will then have this new method:
+
+```
+GetHostIDsForPod(pod *v1.Pod, containerUID, containerGID *int64) (hostUID, hostGID *int64, err error)
+```
+
+This method will only transform the IDs if the pod is running with userns
+enabled. If userns is disabled, the params will not be translated; in addition,
+GetHostIDsForPod() will return nil if the containerUID/containerGID received
+were nil. This is to keep the current functionality untouched, as [fsUser and
+fsGroup can be set to nil][operation-executor] today by the operation executor.
+
+Secondly, we need to modify these volume types to honor the `fsUser` passed in
+mounterArgs to their SetUpAt() method. As the [AtomicWriter already knows and
+honors the FsUser field][atomic-writer-fsUser], the only missing piece is to
+set this field when creating the FileProjection (only done when the feature
+gate is enabled). For example, configmaps will just need to set FsUser
+alongside Data and Mode [here][configmap-fsuser] (and, of course, in all the
+other paths in that function that create the projection). Other volume types
+are quite similar.
+
+For this we need to modify the MakePayload() or CollectData() functions of each
+supported volume type to handle the new fsUser param, and the corresponding
+volume SetUpAt() function to pass this new param when calling AtomicWriter.
+
+We have already written this code; the per-volume diff is about 3 lines.
+
+For fsGroup we don't need any changes, as these volume types already honor it.
+
+[volume-mounter-args]: https://github.com/kubernetes/kubernetes/blob/cfd69463deeebfc5ae8a0813d7d2b125033c4aca/pkg/volume/volume.go#L125-L129
+[operation-executor]: https://github.com/kubernetes/kubernetes/blob/cfd69463deeebfc5ae8a0813d7d2b125033c4aca/pkg/volume/util/operationexecutor/operation_generator.go#L674-L675
+[volumeHost-interface]: https://github.com/kubernetes/kubernetes/blob/cfd69463deeebfc5ae8a0813d7d2b125033c4aca/pkg/volume/plugins.go#L360-L361
+[atomic-writer-fsUser]: https://github.com/kubernetes/kubernetes/blob/cfd69463deeebfc5ae8a0813d7d2b125033c4aca/pkg/volume/util/atomic_writer.go#L68
+[configmap-fsuser]: https://github.com/kubernetes/kubernetes/blob/cfd69463deeebfc5ae8a0813d7d2b125033c4aca/pkg/volume/configmap/configmap.go#L272-L273
+
 #### Phase 2: pods with volumes
 
 This phase makes user namespaces work in pods with volumes too. This is
@@ -267,6 +389,12 @@ listed vulnerabilities (as the host is protected from the container).
 It is
 also a trivial next-step to take, given that we have phase 1 implemented:
 just return the same mapping if the pod has other volumes.
 
+While these pods do not use a distinct user namespace mapping, they are still
+using a new user namespace object in the kernel (so they cannot join/attack
+other pods' namespaces). Security-wise this is a middle layer between what we
+have today (no userns at all) and using a distinct UID/GID mapping for the user
+namespace.
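+
+For illustration, two pods in this phase could even share identical ID
+mappings and still live in different user namespace objects (the inode values
+below are made up):
+
+```
+$ readlink /proc/$PID_OF_POD_A/ns/user
+user:[4026532571]
+$ readlink /proc/$PID_OF_POD_B/ns/user
+user:[4026532642]    # different inode: a different user namespace object
+```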
+
 #### Phase 3: TBD
 
 #### Unresolved
 
@@ -315,24 +443,19 @@ Idem before, see Sergey idea from previous item.
    the KEP to implementable (or if there are long delays to get a review, we
    might decide to do it after the first alpha, as the community prefers and
    time allows). Same applies for VM runtimes.
+   UPDATE: Windows maintainers reviewed and [this change looks good to
+   them][windows-review].
+
+[windows-review]: https://github.com/kubernetes/enhancements/pull/3275#issuecomment-1121603308
 
 ### Summary of the Proposed Changes
 
 [This table](https://docs.google.com/presentation/d/1z4oiZ7v4DjWpZQI2kbFbI8Q6botFaA07KJYaKA-vZpg/edit#slide=id.gfd10976c8b_1_41)
 gives you a quick overview of each phase (note it is outdated, but still
 useful for a general overview).
-
 ### Test Plan
 
-TBD
+[x] I/we understand the owners of the involved components may require updates to
+existing tests to make this code solid enough prior to committing the changes necessary
+to implement this enhancement.
+
+##### Prerequisite testing updates
+
+##### Unit tests
+
+Most of the changes will be in a new file we will create, the userns manager.
+Tests are already written for that file.
+
+The rest of the changes are more about hooking it up so it is called.
+
+- `pkg/kubelet/kuberuntime/kuberuntime_container_linux.go`: 24.05.2022 - 90.8%
+- `pkg/volume/util/operationexecutor/operation_generator.go`: 24.05.2022 - 8.6%
+- `pkg/volume/configmap/configmap.go`: 24.05.2022 - 74.8%
+- `pkg/volume/downwardapi/downwardapi.go`: 24.05.2022 - 61.7%
+- `pkg/volume/secret/secret.go`: 24.05.2022 - 65.7%
+- `pkg/volume/projected/projected.go`: 24.05.2022 - 68.2%
+
+##### Integration tests
+
+We will add a test that:
+
+1. Starts the kubelet with the feature gate enabled
+1. Creates a pod with user namespaces
+1. Restarts the kubelet with the feature gate disabled
+1. Deletes the pod
+
+We will check the pod is deleted correctly and the pod folder is indeed deleted.
+
+The same note about container runtime versions from the e2e section applies.
+
+##### e2e tests
+
+We can add e2e tests to verify that userns is disabled when it should be.
+
+To test with userns enabled, we need to patch container runtimes. We can either
+try to use a patched version, or make the alpha longer and add e2e tests when
+we can use container runtime versions that have the needed changes.
+
+##### critests
+
+- For Alpha, the feature is tested for containerd and CRI-O in the cri-tools
+  repo using critest, to make sure the specified user namespace configuration
+  is honored.
 
 ### Graduation Criteria
 
@@ -363,88 +576,476 @@ Note this section is a WIP yet.
 
 ### Version Skew Strategy
 
+Some definitions first:
+
+- New kubelet: a kubelet whose CRI proto files include the changes proposed in
+  this KEP.
+
+- Old kubelet: idem, but its CRI proto files don't include these changes.
+
+- New runtime: a container runtime whose CRI proto files include the changes
+  proposed in this KEP.
+
+- Old runtime: idem, but its CRI proto files don't include these changes.
+
+New runtime and old kubelet: containers are created without userns. The kubelet
+doesn't request userns (it doesn't have that feature) and therefore the runtime
+doesn't create them. The runtime can detect this situation as the `user` field
+in the `NamespaceOption` will be seen as nil, [thanks to
+protobuf][proto3-defaults]. We already tested this with real code.
+
+Old runtime and new kubelet: containers are created without userns. As the
+`user` field of the `NamespaceOption` message is not part of the runtime's
+proto files, that part is ignored by the runtime and pods are created using the
+host userns.
+
+[proto3-defaults]: https://developers.google.com/protocol-buffers/docs/proto3#default
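+
+To illustrate the backward-compatibility rule above, here is a sketch of the
+check a runtime could perform (the types are stand-ins mirroring the proposed
+CRI messages; this is not actual containerd or CRI-O code):
+
+```
+package main
+
+import "fmt"
+
+// Stand-ins for the proposed CRI messages (illustrative only).
+type NamespaceMode int
+
+const (
+    POD NamespaceMode = iota
+    NODE
+)
+
+type UserNamespace struct {
+    Mode NamespaceMode
+    // Uids and Gids mappings omitted for brevity.
+}
+
+type NamespaceOption struct {
+    UsernsOptions *UserNamespace // nil when sent by a kubelet that predates this KEP
+}
+
+// effectiveUserns applies the rule above: with proto3, an old kubelet leaves
+// userns_options unset, the runtime sees it as nil, and the POD mode must be
+// assumed.
+func effectiveUserns(opts *NamespaceOption) UserNamespace {
+    if opts.UsernsOptions == nil {
+        return UserNamespace{Mode: POD}
+    }
+    return *opts.UsernsOptions
+}
+
+func main() {
+    oldKubelet := &NamespaceOption{} // userns_options never set
+    fmt.Println(effectiveUserns(oldKubelet).Mode == POD) // true
+}
+```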
 
 ## Production Readiness Review Questionnaire
 
 ### Feature Enablement and Rollback
 
-* **How can this feature be enabled / disabled in a live cluster?**
-  - [ ] Feature gate
-    - Feature gate name:
-    - Components depending on the feature gate:
+###### How can this feature be enabled / disabled in a live cluster?
+
+- [x] Feature gate (also fill in values in `kep.yaml`)
+  - Feature gate name: UserNamespacesSupport
+  - Components depending on the feature gate: kubelet, kube-apiserver
+
+###### Does enabling the feature change any default behavior?
 
-* **Does enabling the feature change any default behavior?**
+No.
 
-* **Can the feature be disabled once it has been enabled (i.e. can we roll back
-  the enablement)?**
+###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)?
+
+Yes. Pods that were created and opted in to use user namespaces will have to be
+recreated to run again without user namespaces.
+
+Other than that, it is safe to disable once it has been enabled.
+
+It is also safe to:
+
+1. Enable the feature gate
+1. Create a pod with user namespaces
+1. Disable the feature gate in the kubelet (and restart it)
+1. Delete the pod
+
+We won't leak files or other resources in that scenario either.
+
-* **What happens if we reenable the feature if it was previously rolled back?**
+###### What happens if we reenable the feature if it was previously rolled back?
+
+Pods will have to be re-created to use the feature.
+
-* **Are there any tests for feature enablement/disablement?**
+###### Are there any tests for feature enablement/disablement?
+
+We will add them.
+
+We will test for when the field pod.spec.hostUsers is set to true, false, and
+not set. All of this with and without the feature gate enabled.
+
+We will also unit test that, if pods were created with the new field
+pod.spec.hostUsers, then everything works as expected if the feature gate is
+disabled (no user namespace is used).
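+
+For illustration, such a unit test could look roughly like this (a sketch
+only: it assumes the gate is registered as `features.UserNamespacesSupport`,
+that the proposed `pod.spec.hostUsers` field exists, and it uses a
+hypothetical helper `podUsesHostUserns` standing in for the real kubelet
+logic):
+
+```
+package kuberuntime
+
+import (
+    "testing"
+
+    v1 "k8s.io/api/core/v1"
+    utilfeature "k8s.io/apiserver/pkg/util/feature"
+    featuregatetesting "k8s.io/component-base/featuregate/testing"
+    "k8s.io/kubernetes/pkg/features"
+)
+
+// podUsesHostUserns is a stand-in: the pod runs in the host user namespace
+// unless the gate is on and hostUsers is explicitly false.
+func podUsesHostUserns(pod *v1.Pod) bool {
+    if !utilfeature.DefaultFeatureGate.Enabled(features.UserNamespacesSupport) {
+        return true
+    }
+    return pod.Spec.HostUsers == nil || *pod.Spec.HostUsers
+}
+
+func TestHostUsersWithFeatureGate(t *testing.T) {
+    no := false
+    for _, tc := range []struct {
+        name        string
+        gateEnabled bool
+        hostUsers   *bool
+        wantHostNS  bool
+    }{
+        {"gate off, hostUsers=false is ignored", false, &no, true},
+        {"gate on, hostUsers=false", true, &no, false},
+        {"gate on, hostUsers unset", true, nil, true},
+    } {
+        t.Run(tc.name, func(t *testing.T) {
+            defer featuregatetesting.SetFeatureGateDuringTest(t,
+                utilfeature.DefaultFeatureGate, features.UserNamespacesSupport, tc.gateEnabled)()
+
+            pod := &v1.Pod{Spec: v1.PodSpec{HostUsers: tc.hostUsers}}
+            if got := podUsesHostUserns(pod); got != tc.wantHostNS {
+                t.Errorf("host userns = %v, want %v", got, tc.wantHostNS)
+            }
+        })
+    }
+}
+```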
 
 ### Rollout, Upgrade and Rollback Planning
 
-Will be added before transition to beta.
+
+###### How can a rollout or rollback fail? Can it impact already running workloads?
 
-* **What specific metrics should inform a rollback?**
+###### What specific metrics should inform a rollback?
 
-* **Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested?**
+###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested?
 
-* **Is the rollout accompanied by any deprecations and/or removals of features, APIs,
-fields of API types, flags, etc.?**
+###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.?
 
 ### Monitoring Requirements
 
-Will be added before transition to beta.
+
+###### How can an operator determine if the feature is in use by workloads?
 
-* **What are the SLIs (Service Level Indicators) an operator can use to determine
-the health of the service?**
+###### How can someone using this feature know that it is working for their instance?
+
+- [ ] Events
+  - Event Reason:
+- [ ] API .status
+  - Condition name:
+  - Other field:
+- [ ] Other (treat as last resort)
+  - Details:
 
-* **What are the reasonable SLOs (Service Level Objectives) for the above SLIs?**
+###### What are the reasonable SLOs (Service Level Objectives) for the enhancement?
+
+###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service?
+
+- [ ] Metrics
+  - Metric name:
+  - [Optional] Aggregation method:
+  - Components exposing the metric:
+- [ ] Other (treat as last resort)
+  - Details:
 
-* **Are there any missing metrics that would be useful to have to improve observability
-of this feature?**
+###### Are there any missing metrics that would be useful to have to improve observability of this feature?
 
 ### Dependencies
 
-* **Does this feature depend on any specific services running in the cluster?**: No.
+###### Does this feature depend on any specific services running in the cluster?
+
+No.
 
 ### Scalability
 
-* **Will enabling / using this feature result in any new API calls?** No.
+###### Will enabling / using this feature result in any new API calls?
+
+No.
 
-* **Will enabling / using this feature result in introducing new API types?** No.
+###### Will enabling / using this feature result in introducing new API types?
+
+No.
 
-* **Will enabling / using this feature result in any new calls to the cloud
-provider?** No.
+###### Will enabling / using this feature result in any new calls to the cloud provider?
+
+No.
 
-* **Will enabling / using this feature result in increasing size or count of
-the existing API objects?** Yes. The PodSpec will be increased.
+###### Will enabling / using this feature result in increasing size or count of the existing API objects?
+
+Yes. The pod.spec.hostUsers field is a bool, so the increase is small.
 
-* **Will enabling / using this feature result in increasing time taken by any
-operations covered by [existing SLIs/SLOs]?**
+###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs?
+
+Not in any Kubernetes component. It might take more time for the container
+runtime to start the pod _if it supports old kernels_.
 
-* **Will enabling / using this feature result in non-negligible increase of
-resource usage (CPU, RAM, disk, IO, ...) in any components?**: No.
+With the info below, we decided that container runtimes _can_ support old
+kernels, but in that case this setup step is excluded from the SLO.
+
+The SLO that might be affected is:
+
+> Startup latency of schedulable stateless pods, excluding time to pull images and run init containers, measured from pod creation timestamp to when all its containers are reported as started and observed via watch, measured as 99th percentile over last 5 minutes
+
+The rootfs needs to be accessible by the user in the user namespace the pod is
+in. As every pod might run as a different user, we only know the mapping for a
+pod when it is created.
+
+If new kernels are used (5.19+), the container runtime can use idmapped mounts
+in the rootfs and the UID/GID shifting needed so the pod can access the rootfs
+takes a few ms (it is just a bind mount).
+
+If the container runtimes want to support old kernels too, they will need to
+chown the rootfs to the new UIDs/GIDs. This can be slow, although the metacopy
+overlayfs parameter helps, so it is not so bad.
+
+The options we have for this plumbing to set up the rootfs:
+ * Consider it the same way we consider the image pull, and explicitly exclude
+   it from the SLO.
 * Tell runtimes to not support old kernels (KEP authors are working on the
+   containerd and CRI-O implementations too, so we can easily do this)
+ * Tell runtimes to support old kernels if they want, but with a big fat warning
+   in the docs about the implications
+
+In any case, the Kubernetes components do not need any change.
+
+###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components?
+
+Not in any Kubernetes component.
+
+The container runtime might use more disk if the runtime supports this feature
+on old kernels, as that means the runtime needs to chown the rootfs (see the
+previous question for more details).
+
+This is not needed on newer kernels, as they can rely on idmapped mounts for
+the UID/GID shifting (it is just a bind mount).
 
 ### Troubleshooting
 
-Will be added before transition to beta.
+
+###### How does this feature react if the API server and/or etcd is unavailable?
 
-* **How does this feature react if the API server and/or etcd is unavailable?**
+###### What are other known failure modes?
 
-* **What are other known failure modes?**
 
-* **What steps should be taken if SLOs are not being met to determine the problem?**
+###### What steps should be taken if SLOs are not being met to determine the problem?
 
 ## Implementation History
 
 ## Drawbacks
 
 ## Alternatives
 
-## References
+## Infrastructure Needed (Optional)
diff --git a/keps/sig-node/127-user-namespaces/kep.yaml b/keps/sig-node/127-user-namespaces/kep.yaml
index b0a8798cf2c..ba5f38ffee2 100644
--- a/keps/sig-node/127-user-namespaces/kep.yaml
+++ b/keps/sig-node/127-user-namespaces/kep.yaml
@@ -3,10 +3,9 @@ kep-number: 127
 authors:
   - "@rata"
   - "@giuseppe"
-  - "@mauriciovasquezbernal"
 owning-sig: sig-node
 participating-sigs: []
-status: provisional
+status: implementable
 creation-date: 2021-11-02
 reviewers:
   - "@mrunalp"
@@ -14,8 +13,15 @@ reviewers:
   - "@thockin"
 approvers:
   - "@derekwaynecarr"
+
+stage: alpha
+latest-milestone: "v1.25"
+milestone:
+  alpha: "v1.25"
+
 feature-gates:
   - name: UserNamespacesSupport
     components:
       - kubelet
+      - kube-apiserver
     disable-supported: true