From 518da33397bebd0589f64bf9a9fe86e6762cbbe1 Mon Sep 17 00:00:00 2001
From: unknown
Date: Mon, 8 Jul 2019 19:10:24 +0800
Subject: [PATCH 1/2] Fix scheduler panic when the GPU is lost on a node

---
 pkg/scheduler/api/node_info.go      | 20 ++++++++++++++++++--
 pkg/scheduler/api/node_info_test.go | 19 +++++++++++++++++++
 2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/pkg/scheduler/api/node_info.go b/pkg/scheduler/api/node_info.go
index 77625a41ef..a8c7d15885 100644
--- a/pkg/scheduler/api/node_info.go
+++ b/pkg/scheduler/api/node_info.go
@@ -161,6 +161,18 @@ func (ni *NodeInfo) SetNode(node *v1.Node) {
 	}
 }
 
+func (ni *NodeInfo) allocateIdleResource(ti *TaskInfo) error {
+	if ti.Resreq.LessEqual(ni.Idle) {
+		ni.Idle.Sub(ti.Resreq)
+		return nil
+	}
+	ni.State = NodeState{
+		Phase:  NotReady,
+		Reason: "OutOfSync",
+	}
+	return fmt.Errorf("Selected node NotReady")
+}
+
 // AddTask is used to add a task in nodeInfo object
 func (ni *NodeInfo) AddTask(task *TaskInfo) error {
 	key := PodKey(task.Pod)
@@ -176,12 +188,16 @@ func (ni *NodeInfo) AddTask(task *TaskInfo) error {
 	if ni.Node != nil {
 		switch ti.Status {
 		case Releasing:
+			if err := ni.allocateIdleResource(ti); err != nil {
+				return err
+			}
 			ni.Releasing.Add(ti.Resreq)
-			ni.Idle.Sub(ti.Resreq)
 		case Pipelined:
 			ni.Releasing.Sub(ti.Resreq)
 		default:
-			ni.Idle.Sub(ti.Resreq)
+			if err := ni.allocateIdleResource(ti); err != nil {
+				return err
+			}
 		}
 
 		ni.Used.Add(ti.Resreq)
diff --git a/pkg/scheduler/api/node_info_test.go b/pkg/scheduler/api/node_info_test.go
index ae08d45208..6b5c285f75 100644
--- a/pkg/scheduler/api/node_info_test.go
+++ b/pkg/scheduler/api/node_info_test.go
@@ -37,6 +37,9 @@ func TestNodeInfo_AddPod(t *testing.T) {
 	case01Node := buildNode("n1", buildResourceList("8000m", "10G"))
 	case01Pod1 := buildPod("c1", "p1", "n1", v1.PodRunning, buildResourceList("1000m", "1G"), []metav1.OwnerReference{}, make(map[string]string))
 	case01Pod2 := buildPod("c1", "p2", "n1", v1.PodRunning, buildResourceList("2000m", "2G"), []metav1.OwnerReference{}, make(map[string]string))
+	// case2
+	case02Node := buildNode("n2", buildResourceList("2000m", "1G"))
+	case02Pod1 := buildPod("c2", "p1", "n2", v1.PodUnknown, buildResourceList("1000m", "2G"), []metav1.OwnerReference{}, make(map[string]string))
 
 	tests := []struct {
 		name     string
@@ -63,6 +66,22 @@ func TestNodeInfo_AddPod(t *testing.T) {
 				},
 			},
 		},
+		{
+			name: "add 1 unknown pod",
+			node: case02Node,
+			pods: []*v1.Pod{case02Pod1},
+			expected: &NodeInfo{
+				Name:        "n2",
+				Node:        case02Node,
+				Idle:        buildResource("2000m", "1G"),
+				Used:        EmptyResource(),
+				Releasing:   EmptyResource(),
+				Allocatable: buildResource("2000m", "1G"),
+				Capability:  buildResource("2000m", "1G"),
+				State:       NodeState{Phase: NotReady, Reason: "OutOfSync"},
+				Tasks: map[TaskID]*TaskInfo{},
+			},
+		},
 	}
 
 	for i, test := range tests {

From 52d973211e591147443da0cd6fe1cd57feb679d6 Mon Sep 17 00:00:00 2001
From: unknown
Date: Mon, 8 Jul 2019 19:56:51 +0800
Subject: [PATCH 2/2] Fix the build issue

---
 pkg/scheduler/api/node_info_test.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/scheduler/api/node_info_test.go b/pkg/scheduler/api/node_info_test.go
index 6b5c285f75..4ccbfb9985 100644
--- a/pkg/scheduler/api/node_info_test.go
+++ b/pkg/scheduler/api/node_info_test.go
@@ -79,7 +79,7 @@ func TestNodeInfo_AddPod(t *testing.T) {
 				Allocatable: buildResource("2000m", "1G"),
 				Capability:  buildResource("2000m", "1G"),
 				State:       NodeState{Phase: NotReady, Reason: "OutOfSync"},
-				Tasks: map[TaskID]*TaskInfo{},
+				Tasks:       map[TaskID]*TaskInfo{},
 			},
 		},
 	}
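
For reference, the pattern introduced by the first commit can be illustrated with a minimal, self-contained Go sketch. The Resource, TaskInfo, and NodeInfo types below are simplified stand-ins, not the real structs in pkg/scheduler/api; the real Resource.Sub panics when the subtrahend exceeds what is available, which is the crash this patch guards against when a lost GPU shrinks a node's idle resources below what an already-bound pod requests.

package main

import "fmt"

// Resource tracks a single scalar (think "GPUs") for brevity.
type Resource struct{ GPU int }

func (r *Resource) LessEqual(o *Resource) bool { return r.GPU <= o.GPU }

// Sub mimics the behaviour relied on by the patch: subtracting more than is
// available is treated as a programming error and panics.
func (r *Resource) Sub(o *Resource) {
	if o.GPU > r.GPU {
		panic(fmt.Sprintf("resource is not sufficient: %d < %d", r.GPU, o.GPU))
	}
	r.GPU -= o.GPU
}

type TaskInfo struct{ Resreq *Resource }

type NodeInfo struct {
	Idle  *Resource
	Ready bool
}

// allocateIdleResource follows the pattern from the patch: check before
// subtracting, and degrade the node instead of letting Sub panic.
// (The real code sets NodeState{Phase: NotReady, Reason: "OutOfSync"}.)
func (ni *NodeInfo) allocateIdleResource(ti *TaskInfo) error {
	if ti.Resreq.LessEqual(ni.Idle) {
		ni.Idle.Sub(ti.Resreq)
		return nil
	}
	ni.Ready = false
	return fmt.Errorf("selected node NotReady")
}

func main() {
	// The node originally reported 1 GPU, but the device plugin lost it, so
	// Idle is now 0 while a bound pod still asks for 1 GPU.
	node := &NodeInfo{Idle: &Resource{GPU: 0}, Ready: true}
	task := &TaskInfo{Resreq: &Resource{GPU: 1}}

	if err := node.allocateIdleResource(task); err != nil {
		fmt.Println("scheduler keeps running:", err) // previously: panic in Sub
	}
}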