From c633925c39fd69a9a52031ab2ddab5f00e75abb5 Mon Sep 17 00:00:00 2001
From: Wei Huang
Date: Sun, 1 Oct 2023 16:07:12 +1100
Subject: [PATCH] [YUNIKORN-2009] placeholder pod limit support (#684)

Due to the way the LimitRange controller handles default limits, the
current placeholder pod creation could break. In YUNIKORN-1908 limits
were added for extended resources only. To support a LimitRange with
defaults set, we need to add limits for all resources, not just the
extended resources. Otherwise the LimitRange controller could update
the placeholder pods and leave them with an unschedulable
configuration.

Closes: #684

Signed-off-by: Wilfred Spiegelenburg
---
 pkg/cache/placeholder.go            |  3 +--
 pkg/cache/placeholder_test.go       |  4 +--
 pkg/common/utils/gang_utils.go      | 25 ------------------
 pkg/common/utils/gang_utils_test.go | 39 -----------------------------
 4 files changed, 3 insertions(+), 68 deletions(-)

diff --git a/pkg/cache/placeholder.go b/pkg/cache/placeholder.go
index 559864db1..030dc68d2 100644
--- a/pkg/cache/placeholder.go
+++ b/pkg/cache/placeholder.go
@@ -86,7 +86,6 @@ func newPlaceholder(placeholderName string, app *Application, taskGroup interfac
 
 	// prepare the resource lists
 	requests := utils.GetPlaceholderResourceRequests(taskGroup.MinResource)
-	limits := utils.GetPlaceholderResourceLimits(taskGroup.MinResource)
 	placeholderPod := &v1.Pod{
 		ObjectMeta: metav1.ObjectMeta{
 			Name: placeholderName,
@@ -112,7 +111,7 @@ func newPlaceholder(placeholderName string, app *Application, taskGroup interfac
 					ImagePullPolicy: v1.PullIfNotPresent,
 					Resources: v1.ResourceRequirements{
 						Requests: requests,
-						Limits:   limits,
+						Limits:   requests,
 					},
 				},
 			},
diff --git a/pkg/cache/placeholder_test.go b/pkg/cache/placeholder_test.go
index 0be88637d..9c30823b4 100644
--- a/pkg/cache/placeholder_test.go
+++ b/pkg/cache/placeholder_test.go
@@ -235,7 +235,7 @@ func TestNewPlaceholderExtendedResources(t *testing.T) {
 	app.setTaskGroups(taskGroups)
 	holder := newPlaceholder("ph-name", app, app.taskGroups[0])
 	assert.Equal(t, len(holder.pod.Spec.Containers[0].Resources.Requests), 5, "expected requests not found")
-	assert.Equal(t, len(holder.pod.Spec.Containers[0].Resources.Limits), 2, "limit for extended resource not found")
+	assert.Equal(t, len(holder.pod.Spec.Containers[0].Resources.Limits), 5, "expected limits not found")
 	assert.Equal(t, holder.pod.Spec.Containers[0].Resources.Limits[gpu], holder.pod.Spec.Containers[0].Resources.Requests[gpu], "gpu: expected same value for request and limit")
 	assert.Equal(t, holder.pod.Spec.Containers[0].Resources.Limits[hugepages], holder.pod.Spec.Containers[0].Resources.Requests[hugepages], "hugepages: expected same value for request and limit")
 	var priority *int32
@@ -272,7 +272,7 @@ func TestNewPlaceholderWithPriorityClassName(t *testing.T) {
 
 	holder := newPlaceholder("ph-name", app, app.taskGroups[0])
 	assert.Equal(t, len(holder.pod.Spec.Containers[0].Resources.Requests), 5, "expected requests not found")
-	assert.Equal(t, len(holder.pod.Spec.Containers[0].Resources.Limits), 2, "limit for extended resource not found")
+	assert.Equal(t, len(holder.pod.Spec.Containers[0].Resources.Limits), 5, "expected limits not found")
 	assert.Equal(t, holder.pod.Spec.Containers[0].Resources.Limits[gpu], holder.pod.Spec.Containers[0].Resources.Requests[gpu], "gpu: expected same value for request and limit")
 	assert.Equal(t, holder.pod.Spec.Containers[0].Resources.Limits[hugepages], holder.pod.Spec.Containers[0].Resources.Requests[hugepages], "hugepages: expected same value for request and limit")
 	var priority *int32
diff --git a/pkg/common/utils/gang_utils.go b/pkg/common/utils/gang_utils.go
index 9f876a3d1..d740892f3 100644
--- a/pkg/common/utils/gang_utils.go
+++ b/pkg/common/utils/gang_utils.go
@@ -91,31 +91,6 @@ func GetPlaceholderResourceRequests(resources map[string]resource.Quantity) v1.R
 	return resourceReq
 }
 
-// GetPlaceholderResourceLimits converts the map of resources requested into a list of resources for the limit
-// specification that can be added to a pod. This only adds the minimal required resources that do not support
-// over commit. The following resources do support over commit: all standard kubernetes resources (except hugepages*)
-// All non-over committable, i.e. extended, resources MUST have a limit set to the same value as the request.
-func GetPlaceholderResourceLimits(resources map[string]resource.Quantity) v1.ResourceList {
-	resourceLim := v1.ResourceList{}
-	for k, v := range resources {
-		if k == "" || allowOverCommit(k) {
-			continue
-		}
-		resourceLim[v1.ResourceName(k)] = v
-	}
-	return resourceLim
-}
-
-// allowOverCommit returns true if the resource can be over committed.
-// This comes down to only allow the standard resources cpu, memory and ephemeral-storage to be over committed.
-// We deviate from the K8s checks as opaque resources are no longer supported. Opaque resources were the only
-// resources in the "kubernetes.io" domain.
-// We slip in the "pods" resource on task groups but that does not cause an issue.
-func allowOverCommit(name string) bool {
-	return !strings.Contains(name, "/") &&
-		!strings.HasPrefix(name, v1.ResourceHugePagesPrefix)
-}
-
 func GetSchedulingPolicyParam(pod *v1.Pod) *interfaces.SchedulingPolicyParameters {
 	timeout := int64(0)
 	style := constants.SchedulingPolicyStyleParamDefault
diff --git a/pkg/common/utils/gang_utils_test.go b/pkg/common/utils/gang_utils_test.go
index 8096df24a..ad1ca3f21 100644
--- a/pkg/common/utils/gang_utils_test.go
+++ b/pkg/common/utils/gang_utils_test.go
@@ -140,24 +140,6 @@ func TestGetSchedulingPolicyParams(t *testing.T) {
 	}
 }
 
-func Test_allowOverCommit(t *testing.T) {
-	tests := []struct {
-		name    string
-		resName string
-		want    bool
-	}{
-		{"standard", "memory", true},
-		{"pod as resource", "pods", true},
-		{"hugepages", "hugepages-small", false},
-		{"extended", "nvidia.com/gpu", false},
-	}
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			assert.Equal(t, allowOverCommit(tt.resName), tt.want, "incorrect overcommit status")
-		})
-	}
-}
-
 func Test_GetPlaceholderResourceRequest(t *testing.T) {
 	tests := []struct {
 		name   string
@@ -178,24 +160,3 @@ func Test_GetPlaceholderResourceRequest(t *testing.T) {
 		})
 	}
 }
-
-func Test_GetPlaceholderResourceLimits(t *testing.T) {
-	tests := []struct {
-		name   string
-		resMap map[string]resource.Quantity
-		want   v1.ResourceList
-	}{
-		{"nil", nil, v1.ResourceList{}},
-		{"empty", map[string]resource.Quantity{}, v1.ResourceList{}},
-		{"base", map[string]resource.Quantity{"pods": resource.MustParse("1")}, v1.ResourceList{}},
-		{"hugepages", map[string]resource.Quantity{"hugepages-huge": resource.MustParse("2")}, v1.ResourceList{"hugepages-huge": resource.MustParse("2")}},
-		{"mixed", map[string]resource.Quantity{"pods": resource.MustParse("4"), "nvidia.com/gpu": resource.MustParse("5")}, v1.ResourceList{"nvidia.com/gpu": resource.MustParse("5")}},
-	}
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			if got := GetPlaceholderResourceLimits(tt.resMap); !reflect.DeepEqual(got, tt.want) {
-				t.Errorf("GetPlaceholderResourceRequest() = %v, want %v", got, tt.want)
-			}
-		})
-	}
-}
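
For illustration only, not part of the change: a minimal Go sketch of the
failure mode described in the commit message, using hypothetical resource
names and values. Only the "Limits = Requests" handling mirrors the patch;
the LimitRange interaction is summarised in the comments.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	// Resources a placeholder pod requests (hypothetical values).
	requests := v1.ResourceList{
		v1.ResourceCPU:                    resource.MustParse("500m"),
		v1.ResourceMemory:                 resource.MustParse("1Gi"),
		v1.ResourceName("nvidia.com/gpu"): resource.MustParse("1"),
	}

	// Old behaviour: only the extended resource got a limit. A LimitRange
	// with default limits would then fill in the missing cpu and memory
	// limits; a default below the request (say cpu: 100m) produces an
	// invalid pod, because a limit must always be >= the request.
	oldLimits := v1.ResourceList{
		v1.ResourceName("nvidia.com/gpu"): resource.MustParse("1"),
	}

	// New behaviour: every requested resource gets an identical limit,
	// so the LimitRange defaulting has nothing left to mutate.
	newRR := v1.ResourceRequirements{
		Requests: requests,
		Limits:   requests,
	}

	fmt.Println("old limits:", oldLimits)
	fmt.Println("new limits:", newRR.Limits)
}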