Commit 0f3d34a

fix the proportion plugin that ignores the inqueue resource of running jobs when the podgroup's minResources is not nil

Signed-off-by: Thor-wl <13164644535@163.com>
Thor-wl committed Mar 4, 2022
1 parent 523bc2f commit 0f3d34a
Showing 3 changed files with 67 additions and 1 deletion.
16 changes: 15 additions & 1 deletion pkg/scheduler/plugins/overcommit/overcommit.go
@@ -88,10 +88,24 @@ func (op *overcommitPlugin) OnSessionOpen(ssn *framework.Session) {
}
op.idleResource = total.Clone().Multi(op.overCommitFactor).Sub(used)

klog.V(4).Infof("op.inqueueResource.cpu: %f", op.inqueueResource.MilliCPU)

for _, job := range ssn.Jobs {
// calculate inqueue job resources
if job.PodGroup.Status.Phase == scheduling.PodGroupInqueue && job.PodGroup.Spec.MinResources != nil {
op.inqueueResource.Add(api.NewResource(*job.PodGroup.Spec.MinResources))
continue
}
// calculate inqueue resource for running jobs
// the check 'job.PodGroup.Status.Running >= job.PodGroup.Spec.MinMember' guards against cases like the following:
// a Spark job whose driver pod has completed while the podgroup stays Running; without the check, the job's already-released allocation would be reserved again.
if job.PodGroup.Status.Phase == scheduling.PodGroupRunning &&
job.PodGroup.Spec.MinResources != nil &&
job.PodGroup.Status.Running >= job.PodGroup.Spec.MinMember {
allocated := util.GetAllocatedResource(job)
inqueued := util.GetInqueueResource(job, allocated)
klog.V(4).Infof("reserve resource for running job %s, inqueue: %s", job.Name, *inqueued)
op.inqueueResource.Add(inqueued)
}
}

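To make the branch structure above concrete, here is a minimal standalone sketch. It uses plain Go with hypothetical simplified types — phase, running, minMember and the milli-CPU fields are stand-ins for the PodGroup fields the plugin reads, not Volcano's api.Resource — and shows which jobs contribute to inqueueResource and by how much:

package main

import "fmt"

// jobView is a hypothetical, simplified stand-in for the PodGroup fields
// inspected in the loop above.
type jobView struct {
	phase           string  // "Inqueue" or "Running"
	hasMinResources bool    // PodGroup.Spec.MinResources != nil
	running         int32   // PodGroup.Status.Running
	minMember       int32   // PodGroup.Spec.MinMember
	minMilliCPU     float64 // CPU requested by MinResources, in millicores
	allocMilliCPU   float64 // CPU already held by the job's pods, in millicores
}

// inqueueContribution mirrors the branch structure above: an Inqueue job
// reserves its full minimum, while a Running job that has reached minMember
// reserves only the still-unallocated remainder.
func inqueueContribution(j jobView) float64 {
	if !j.hasMinResources {
		return 0
	}
	if j.phase == "Inqueue" {
		return j.minMilliCPU
	}
	if j.phase == "Running" && j.running >= j.minMember {
		if gap := j.minMilliCPU - j.allocMilliCPU; gap > 0 {
			return gap
		}
	}
	// e.g. a Spark job whose driver completed (running < minMember):
	// reserving here again would double-count the released resource.
	return 0
}

func main() {
	fmt.Println(inqueueContribution(jobView{phase: "Inqueue", hasMinResources: true, minMilliCPU: 4000}))
	// 4000: the whole minimum is reserved for a queued job
	fmt.Println(inqueueContribution(jobView{phase: "Running", hasMinResources: true, running: 3, minMember: 3, minMilliCPU: 4000, allocMilliCPU: 1000}))
	// 3000: only the unallocated remainder is reserved
	fmt.Println(inqueueContribution(jobView{phase: "Running", hasMinResources: true, running: 1, minMember: 3, minMilliCPU: 4000, allocMilliCPU: 1000}))
	// 0: the Spark-driver case the comment above describes
}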
7 changes: 7 additions & 0 deletions pkg/scheduler/plugins/proportion/proportion.go
@@ -140,6 +140,13 @@ func (pp *proportionPlugin) OnSessionOpen(ssn *framework.Session) {
if job.PodGroup.Status.Phase == scheduling.PodGroupInqueue {
attr.inqueue.Add(job.GetMinResources())
}
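// reserve the still-unallocated part of MinResources for running jobs, mirroring the overcommit plugin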
if job.PodGroup.Status.Phase == scheduling.PodGroupRunning &&
job.PodGroup.Spec.MinResources != nil &&
job.PodGroup.Status.Running >= job.PodGroup.Spec.MinMember {
allocated := util.GetAllocatedResource(job)
inqueued := util.GetInqueueResource(job, allocated)
attr.inqueue.Add(inqueued)
}
}

// Record metrics
45 changes: 45 additions & 0 deletions pkg/scheduler/plugins/util/util.go
@@ -298,3 +298,48 @@ func NormalizeScore(maxPriority int64, reverse bool, scores []api.ScoredNode) {
scores[idx].Score = score
}
}

// GetAllocatedResource returns the resource already allocated to the given job
func GetAllocatedResource(job *api.JobInfo) *api.Resource {
allocated := &api.Resource{}
for status, tasks := range job.TaskStatusIndex {
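// AllocatedStatus covers tasks whose resource is already held by the job (Allocated, Bound, Binding, Running)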
if api.AllocatedStatus(status) {
for _, t := range tasks {
allocated.Add(t.Resreq)
}
}
}
return allocated
}

// GetInqueueResource returns the resource still to be reserved for a running job whose pods have not all been allocated yet
func GetInqueueResource(job *api.JobInfo, allocated *api.Resource) *api.Resource {
inqueue := &api.Resource{}
for rName, rQuantity := range *job.PodGroup.Spec.MinResources {
switch rName {
case v1.ResourceCPU:
// allocated.MilliCPU is in millicores, so read the quantity with MilliValue()
reservedCPU := float64(rQuantity.MilliValue()) - allocated.MilliCPU
if reservedCPU > 0 {
inqueue.MilliCPU = reservedCPU
}
case v1.ResourceMemory:
reservedMemory := float64(rQuantity.Value()) - allocated.Memory
if reservedMemory > 0 {
inqueue.Memory = reservedMemory
}
default:
if inqueue.ScalarResources == nil {
inqueue.ScalarResources = make(map[v1.ResourceName]float64)
}
// scalar resources are tracked in milli-units in api.Resource, matching api.NewResource
if allocatedAmount, ok := allocated.ScalarResources[rName]; !ok {
inqueue.ScalarResources[rName] = float64(rQuantity.MilliValue())
} else {
reservedScalarRes := float64(rQuantity.MilliValue()) - allocatedAmount
if reservedScalarRes > 0 {
inqueue.ScalarResources[rName] = reservedScalarRes
}
}
}
}
return inqueue
}
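Since the unit handling above is easy to get wrong, here is a small standalone sketch (depending only on k8s.io/apimachinery) of why CPU must be read with MilliValue() while memory uses Value(), assuming Volcano's convention — as in api.NewResource — of tracking CPU and scalar resources in milli-units and memory in bytes:

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	cpu := resource.MustParse("4")   // 4 cores from MinResources
	mem := resource.MustParse("2Gi") // 2 GiB from MinResources

	// api.Resource tracks CPU in millicores, so the minimum must be read
	// with MilliValue() before subtracting allocated.MilliCPU; Value()
	// would return 4 and the reservation would vanish against e.g. 2500m.
	fmt.Println(cpu.MilliValue()) // 4000
	fmt.Println(cpu.Value())      // 4

	// Memory is tracked in bytes, so Value() is the right accessor there.
	fmt.Println(mem.Value()) // 2147483648

	allocatedMilliCPU := 2500.0
	if gap := float64(cpu.MilliValue()) - allocatedMilliCPU; gap > 0 {
		fmt.Println(gap) // 1500 millicores still to reserve for this job
	}
}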
