Fix scale up with no assigned node #467
Changes from all commits
8ca7086
79408ee
8b8291d
ae12903
3669799
65b6944
15082e0
2529a3e
c6545e1
c4c74f8
692e89d
ff620db
@@ -46,6 +46,19 @@ type ClusterResource struct {
    MemoryRequestMega int64
    MemoryLimitMega   int64
    MemoryTotalMega   int64

    NodeInfos NodeInfos
}

// NodeInfos is the information of all nodes.
type NodeInfos struct {
    NodesCPUIdleMilli   map[string]int64
    NodesMemoryFreeMega map[string]int64
}

// String returns the string that represents NodeInfos when printed.
func (n NodeInfos) String() string {
    return fmt.Sprintf("NodeInfo(%d nodes)", len(n.NodesCPUIdleMilli))
}

// Cluster represents the cluster management system such as Kubernetes.

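For readers skimming the diff, here is a tiny standalone sketch of how the new NodeInfos type behaves. The type is copied into the snippet only so it compiles on its own; the node names and numbers are made up.

    package main

    import "fmt"

    // NodeInfos mirrors the type added in the diff above (copied here for illustration only).
    type NodeInfos struct {
        NodesCPUIdleMilli   map[string]int64
        NodesMemoryFreeMega map[string]int64
    }

    func (n NodeInfos) String() string {
        return fmt.Sprintf("NodeInfo(%d nodes)", len(n.NodesCPUIdleMilli))
    }

    func main() {
        // Made-up node names and capacities.
        ni := NodeInfos{
            NodesCPUIdleMilli:   map[string]int64{"node-a": 1500, "node-b": 500},
            NodesMemoryFreeMega: map[string]int64{"node-a": 2048, "node-b": 1024},
        }
        fmt.Println(ni) // NodeInfo(2 nodes)
    }
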
@@ -222,6 +235,26 @@ nextJob:
    return js
}

func searchAssignableNodeByCPU(r *ClusterResource, j job) (assignable bool) {
    for _, idle := range r.NodeInfos.NodesCPUIdleMilli {
        if j.TrainerCPURequestMilli() <= idle {
            return true
        }
    }
    log.Debug("No node is assignable, job CPU is ", j.TrainerCPURequestMilli())
    return false
}

func searchAssignableNodeByMem(r *ClusterResource, j job) (assignable bool) {
    for _, idle := range r.NodeInfos.NodesMemoryFreeMega {
        if j.TrainerMemRequestMega() <= idle {
            return true
        }
    }
    log.Debug("No node is assignable, job memory is ", j.TrainerMemRequestMega())
    return false
}

func scaleDryRun(r *ClusterResource, j job, curDiff int, maxLoadDesired float64, scaleDown bool) (additional int) {
    additionalGPUInstance := 0
    additionalCPUInstance := 0

@@ -282,6 +315,12 @@ func scaleDryRun(r *ClusterResource, j job, curDiff int, maxLoadDesired float64, scaleDown bool) (additional int) {
        return
    }

    if !searchAssignableNodeByCPU(r, j) || !searchAssignableNodeByMem(r, j) {
        // can not find assignable node, do not scale
        additional = 0
        return
    }

    // NOTE: do not scale up to use full cluster resource of CPU
    // but we do scale up for GPU.
    if int64(float64(r.CPUTotalMilli)*maxLoadDesired)-r.CPURequestMilli >= cpuRequestMilli {

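The check added above only asks whether some node has enough idle CPU and some node has enough free memory. A standalone sketch with simplified stand-in names (nodeInfos, fitsCPU, fitsMem are illustrative, not the PR's API) shows that the two scans are independent, so CPU and memory may be satisfied by different nodes:

    package main

    import "fmt"

    // nodeInfos is a simplified stand-in for the per-node idle resources.
    type nodeInfos struct {
        cpuIdleMilli map[string]int64
        memFreeMega  map[string]int64
    }

    // fitsCPU reports whether any single node has enough idle CPU.
    func fitsCPU(n nodeInfos, reqMilli int64) bool {
        for _, idle := range n.cpuIdleMilli {
            if reqMilli <= idle {
                return true
            }
        }
        return false
    }

    // fitsMem reports whether any single node has enough free memory.
    func fitsMem(n nodeInfos, reqMega int64) bool {
        for _, free := range n.memFreeMega {
            if reqMega <= free {
                return true
            }
        }
        return false
    }

    func main() {
        n := nodeInfos{
            cpuIdleMilli: map[string]int64{"node-a": 800, "node-b": 200},
            memFreeMega:  map[string]int64{"node-a": 512, "node-b": 4096},
        }
        // CPU fits on node-a, memory only on node-b, yet the job is still
        // treated as assignable because the two checks are independent.
        fmt.Println(fitsCPU(n, 500) && fitsMem(n, 1024)) // true
    }
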
@@ -349,43 +388,41 @@ func scaleAllDryRun(jobs []job, r ClusterResource, maxLoadDesired float64) map[string]int {
    return diff
}

func (a *Autoscaler) scaleAll(diff map[string]int) {
    for name := range diff {
        if diff[name] != 0 {
            log.Info("scaling job",
                "name", name, "number of instances", diff[name])
            target := *a.jobs[name].TrainerJob.Spec.Parallelism + int32(diff[name])

            var err error
            for retry := 5; retry > 0; retry-- {
                var tj *batchv1.Job
                // don't shadow err
                tj, err = a.cluster.GetTrainerJob(a.jobs[name].Config)
                if err != nil {
                    log.Warn("sync trainer job error",
                        "error", err, "remaining retry", retry)
                    continue
                }
                j := a.jobs[name]
                // NOTE: only update batchv1.job from k8s api-server before updating
                // for efficiency. Update the job resource to get the latest k8s
                // resource revision.
                j.TrainerJob = tj
                a.jobs[name] = j
                *a.jobs[name].TrainerJob.Spec.Parallelism = target
                err = a.cluster.UpdateTrainerJob(a.jobs[name].TrainerJob)
                if err != nil {
                    log.Error("error updating trainer job",
                        "error", err, "remaining retry", retry)
                    continue
                }

                break

func (a *Autoscaler) scaleAll(target map[string]int) {
    for name := range target {
        log.Info("scaling job",
            "name", name, "target", target[name])
        target := int32(target[name])

        var err error
        for retry := 5; retry > 0; retry-- {
            var tj *batchv1.Job
            // don't shadow err
            tj, err = a.cluster.GetTrainerJob(a.jobs[name].Config)
            if err != nil {
                log.Warn("sync trainer job error",
                    "error", err, "remaining retry", retry)
                continue
            }

            j := a.jobs[name]
            // NOTE: only update batchv1.job from k8s api-server before updating
            // for efficiency. Update the job resource to get the latest k8s
            // resource revision.
            j.TrainerJob = tj
            a.jobs[name] = j
            *a.jobs[name].TrainerJob.Spec.Parallelism = target
            err = a.cluster.UpdateTrainerJob(a.jobs[name].TrainerJob)
            if err != nil {
                log.Warn("Error updating trainer job", "error", err)
                log.Error("error updating trainer job",
                    "error", err, "remaining retry", retry)
                continue
            }

            break
        }

        if err != nil {
            log.Warn("Error updating trainer job", "error", err)
        }
    }
}

@@ -444,9 +481,10 @@ func (a *Autoscaler) Monitor() {
            log.Error("error sync resource", "error", err)
            continue
        }
        log.Info("sync cluster resource done", "resource", r)

        var js []job
        for _, j := range a.jobs {
        for key, j := range a.jobs {
            // k8s job for trainers may not be created immediately,
            // try to sync it here
            if j.TrainerJob == nil {

@@ -460,7 +498,9 @@ func (a *Autoscaler) Monitor() {
                    continue
                }
                j.TrainerJob = tj
                a.jobs[key] = j
Review comments on this line:
- Need to write back to the map.
- Refining the code later is fine. Also, the scaling algorithm should be updated like: … I'll add a design doc for details.
- Thanks @typhoonzero! Btw, maybe …
(A short sketch of the map write-back issue follows this hunk.)
            }

            // Scale jobs only when all pods' "Phase" are running.
            // Pending/starting/terminating jobs are ignored.
            total, running, err := a.cluster.JobPods(j.Config)

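The review comment above about writing back to the map comes from how Go maps of struct values work: indexing returns a copy, so mutating only the copy is lost. A minimal standalone sketch (hypothetical job struct and map, not the autoscaler's own types):

    package main

    import "fmt"

    type job struct {
        Parallelism int32
    }

    func main() {
        jobs := map[string]job{"mnist": {Parallelism: 3}}

        // j is a copy of the map value; changing the copy alone does not change the map.
        j := jobs["mnist"]
        j.Parallelism = 5
        fmt.Println(jobs["mnist"].Parallelism) // still 3

        // Write the modified copy back, which is what a.jobs[key] = j does above.
        jobs["mnist"] = j
        fmt.Println(jobs["mnist"].Parallelism) // 5
    }
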
@@ -473,10 +513,15 @@ func (a *Autoscaler) Monitor() {
            }
        }
        diff := scaleAllDryRun(js, r, a.maxLoadDesired)
        if len(diff) > 0 {
            log.Info("calculated scaling plan", "plan", diff,
        target := make(map[string]int)
        for k, v := range diff {
            target[k] = int(*a.jobs[k].TrainerJob.Spec.Parallelism) + v
        }

        if len(target) > 0 {
            log.Info("calculated scaling plan", "target", target,
                "clusterResource", r)
        }
        a.scaleAll(diff)
        a.scaleAll(target)
    }
}
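The Monitor change above converts the dry-run deltas into absolute parallelism targets before calling scaleAll. A minimal sketch of that conversion, with made-up job names and numbers:

    package main

    import "fmt"

    func main() {
        // Current parallelism per job and the deltas from the dry run (made-up numbers).
        parallelism := map[string]int32{"jobA": 4, "jobB": 2}
        diff := map[string]int{"jobA": 2, "jobB": -1}

        // scaleAll now receives absolute targets instead of deltas.
        target := make(map[string]int)
        for k, v := range diff {
            target[k] = int(parallelism[k]) + v
        }
        fmt.Println(target) // map[jobA:6 jobB:1]
    }
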
@@ -95,6 +95,25 @@ func getPodsTotalRequestsAndLimits(podList *v1.PodList) (reqs v1.ResourceList, l…
    return
}

func updateNodesIdleResource(podList *v1.PodList, nodesCPUIdleMilli map[string]int64, nodesMemoryFreeMega map[string]int64) (err error) {
    for _, pod := range podList.Items {
        nodeName := pod.Spec.NodeName
        if nodeName == "" {
            continue
        }
        for _, container := range pod.Spec.Containers {
Review comments on this line:
- Should we check only pending and running pods?
- Only Terminating and Running pods will take up the resource.
- Oh ok, so maybe we need to check only "Terminating and Running" pods? :)
- I think so, following the comment at https://github.com/kubernetes/kubernetes/blob/v1.6.2/pkg/api/v1/types.go#L2331, if …
- I see, so it's not necessary to check pod status, since as long as …
(A sketch of the phase check discussed here follows this hunk.)
            nodesCPUIdleMilli[nodeName] -= container.Resources.Requests.Cpu().ScaledValue(resource.Milli)
            nodesMemoryFreeMega[nodeName] -= container.Resources.Requests.Memory().ScaledValue(resource.Mega)
        }

        for _, container := range pod.Spec.InitContainers {
            nodesCPUIdleMilli[nodeName] -= container.Resources.Requests.Cpu().ScaledValue(resource.Milli)
            nodesMemoryFreeMega[nodeName] -= container.Resources.Requests.Memory().ScaledValue(resource.Mega)
        }
    }
    return
}

// SyncResource will update free and total resources in k8s cluster.
func (c *Cluster) SyncResource() (res autoscaler.ClusterResource, err error) {
    nodes := c.clientset.CoreV1().Nodes()

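The review thread above discusses skipping pods in a terminal phase, since Succeeded/Failed pods no longer hold node resources. The diff shown here does not add such a check; the following is only a hedged standalone sketch of what that filter could look like, using simplified stand-in types rather than the k8s client types:

    package main

    import "fmt"

    // pod is a simplified stand-in for the fields used here, not the k8s client type.
    type pod struct {
        Phase           string // stand-in for v1.PodStatus.Phase
        NodeName        string
        CPURequestMilli int64
    }

    // idleCPUByNode subtracts requests only for pods that still occupy a node.
    func idleCPUByNode(pods []pod, allocatable map[string]int64) map[string]int64 {
        idle := make(map[string]int64, len(allocatable))
        for name, milli := range allocatable {
            idle[name] = milli
        }
        for _, p := range pods {
            // Terminal or unscheduled pods no longer take node resources.
            if p.NodeName == "" || p.Phase == "Succeeded" || p.Phase == "Failed" {
                continue
            }
            idle[p.NodeName] -= p.CPURequestMilli
        }
        return idle
    }

    func main() {
        pods := []pod{
            {Phase: "Running", NodeName: "node-a", CPURequestMilli: 500},
            {Phase: "Succeeded", NodeName: "node-a", CPURequestMilli: 500},
        }
        fmt.Println(idleCPUByNode(pods, map[string]int64{"node-a": 2000})) // map[node-a:1500]
    }
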
@@ -103,7 +122,14 @@ func (c *Cluster) SyncResource() (res autoscaler.ClusterResource, err error) {
        return autoscaler.ClusterResource{}, err
    }
    allocatable := make(v1.ResourceList)
    nodesCPUIdleMilli := make(map[string]int64)
    nodesMemoryFreeMega := make(map[string]int64)

    for _, node := range nodeList.Items {
        nodesCPUIdleMilli[node.GetObjectMeta().GetName()] =
            node.Status.Allocatable.Cpu().ScaledValue(resource.Milli)
        nodesMemoryFreeMega[node.GetObjectMeta().GetName()] =
            node.Status.Allocatable.Memory().ScaledValue(resource.Mega)
        AddResourceList(allocatable, node.Status.Allocatable)
    }

@@ -130,6 +156,11 @@ func (c *Cluster) SyncResource() (res autoscaler.ClusterResource, err error) {
        return autoscaler.ClusterResource{}, err
    }

    err = updateNodesIdleResource(allPodsList, nodesCPUIdleMilli, nodesMemoryFreeMega)
    if err != nil {
        return autoscaler.ClusterResource{}, err
    }

    res = autoscaler.ClusterResource{
        NodeCount: len(nodeList.Items),
        GPUTotal:  int(allocatable.NvidiaGPU().Value()),

@@ -143,7 +174,11 @@ func (c *Cluster) SyncResource() (res autoscaler.ClusterResource, err error) {
        GPULimit:        int(allLimits.NvidiaGPU().Value()),
        CPULimitMilli:   allLimits.Cpu().ScaledValue(resource.Milli),
        MemoryLimitMega: allLimits.Memory().ScaledValue(resource.Mega),
    }

        NodeInfos: autoscaler.NodeInfos{
            NodesCPUIdleMilli:   nodesCPUIdleMilli,
            NodesMemoryFreeMega: nodesMemoryFreeMega,
        },
    }
    return
}
Review comments:
- Need to return immediately from here, or else the code below may be executed.
- Done.
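The "return immediately" comment above is about not falling through after an error. The exact function it refers to is not visible on this page; the following is only a generic sketch of the pattern, with hypothetical syncResource and monitorOnce functions:

    package main

    import (
        "errors"
        "fmt"
    )

    // syncResource is a hypothetical stand-in for the call whose error is handled.
    func syncResource() error { return errors.New("api-server unreachable") }

    func monitorOnce() {
        if err := syncResource(); err != nil {
            fmt.Println("error sync resource:", err)
            return // without this, the code below would run with stale state
        }
        fmt.Println("proceed with scaling using fresh cluster resource")
    }

    func main() {
        monitorOnce()
    }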