This repository has been archived by the owner on Nov 16, 2023. It is now read-only.

Commit

Split higher level cell when allocated bad cells (#27)
* Split higher level cell when allocated bad cells

When buddy allocation fails due to bad cells, try to split a higher-level cell to obtain cells at the current level (a minimal sketch of this idea follows the commit message below).

* Fix deletion errors

* Add allocate function to split higher level cells

* Fix test

* Add test case

* Resolve comments

* Add free list in panic log when safety is broken

* Add test case when unable to split due to safety guarantee

* Update

* Fix memory leak in removePickedGpus

* Early stop safety check at current level
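
To make the first bullet concrete, here is a minimal, self-contained sketch of the relaxed buddy-allocation idea: when the requested level has no usable free cell, take one free higher-level cell and split it down level by level, returning the unused siblings to the free lists. The toy cell / freeLists types and the allocAtLevel function are illustrative stand-ins only, not HiveD's Cell, ChainCellList, or safeRelaxedBuddyAlloc, and the sketch omits the VC-safety accounting (freeCellNum / splittableNum) that the real code adds.

package main

import "fmt"

// cell is a toy stand-in for a node in a buddy hierarchy.
type cell struct {
    level    int
    children []*cell
}

// freeLists maps level -> free cells at that level.
type freeLists map[int][]*cell

// allocAtLevel tries the requested level first; if it is empty, it walks up,
// takes one free higher-level cell, and splits it down level by level,
// returning the freed siblings to the free lists on the way.
func allocAtLevel(free freeLists, want int, top int) *cell {
    if len(free[want]) > 0 {
        c := free[want][0]
        free[want] = free[want][1:]
        return c
    }
    for l := want + 1; l <= top; l++ {
        if len(free[l]) == 0 {
            continue
        }
        c := free[l][0]
        free[l] = free[l][1:]
        // Split down to the requested level, keeping siblings free.
        for c.level > want {
            free[c.level-1] = append(free[c.level-1], c.children[1:]...)
            c = c.children[0]
        }
        return c
    }
    return nil // nothing splittable
}

func main() {
    // One free level-2 cell with two level-1 children; level 1 starts empty.
    leafA, leafB := &cell{level: 1}, &cell{level: 1}
    root := &cell{level: 2, children: []*cell{leafA, leafB}}
    free := freeLists{1: nil, 2: []*cell{root}}

    got := allocAtLevel(free, 1, 2)
    fmt.Println(got == leafA, len(free[1])) // true 1: leafB is returned to the level-1 free list
}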
abuccts authored Jul 15, 2020
1 parent 406f379 commit 76ed604
Showing 4 changed files with 177 additions and 8 deletions.
83 changes: 81 additions & 2 deletions pkg/algorithm/cell_allocation.go
@@ -24,10 +24,11 @@ package algorithm

import (
"fmt"
"sort"

"github.com/microsoft/hivedscheduler/pkg/api"
"github.com/microsoft/hivedscheduler/pkg/common"
"k8s.io/klog"
"sort"
)

// buddyAlloc is used for allocating a free physical cell to a preassigned virtual cell.
@@ -78,6 +79,76 @@ func buddyAlloc(
return false
}

// safeRelaxedBuddyAlloc is used after buddyAlloc cannot find a healthy cell (or one within the suggested nodes);
// it tries to split a higher-level cell safely to obtain cells at the current level.
func safeRelaxedBuddyAlloc(
cell *cellBindingPathVertex,
freeList ChainCellList,
freeCellNum map[CellLevel]int32,
currentLevel CellLevel,
suggestedNodes common.Set,
ignoreSuggestedNodes bool,
bindings map[api.CellAddress]*PhysicalCell) bool {

var splittableCell Cell
splittableNum := map[CellLevel]int32{}
for i := CellLevel(len(freeList)); i > currentLevel; i-- {
// calculate splittable number
splittableNum[i] = int32(len(freeList[i])) - freeCellNum[i]
if i < CellLevel(len(freeList)) && splittableCell != nil {
splittableNum[i] += splittableNum[i+1] * int32(len(splittableCell.GetChildren()))
}
// iterate higher level cell
if splittableCell == nil && len(freeList[i]) > 0 {
splittableCell = freeList[i][0]
} else if splittableCell != nil {
splittableCell = splittableCell.GetChildren()[0]
}
// check safety
if splittableNum[i] < 0 {
panic(fmt.Sprintf("VC Safety Broken: level %v cell with free list %v is unsplittable, splittableNum=%v",
i, freeList[i], splittableNum[i]))
}
}

for l := currentLevel + 1; l <= CellLevel(len(freeList)); l++ {
cellNum := int32(len(freeList[l]))
if cellNum > splittableNum[l] {
cellNum = splittableNum[l]
}
if cellNum > 0 {
splitList := CellList{}
for i := int32(0); i < cellNum; i++ {
splitList = append(splitList, freeList[l][0])
freeList.remove(freeList[l][0], l)
}
splittableNum[l] -= cellNum
for sl := l; sl > currentLevel; sl-- {
splitChildrenList := CellList{}
for _, sc := range splitList {
splitChildrenList = append(splitChildrenList, sc.GetChildren()...)
}
splitList = splitChildrenList
}
freeList[currentLevel] = append(splitList, freeList[currentLevel]...)
ok, pickedCells := mapVirtualCellsToPhysical(
[]*cellBindingPathVertex{cell},
freeList[currentLevel],
suggestedNodes,
ignoreSuggestedNodes,
bindings,
true)
if ok {
for _, c := range pickedCells {
freeList.remove(c, currentLevel)
}
return true
}
}
}
return false
}

// getLowestFreeCellLevel returns the lowest level in the free cell list with at least one free cell.
func getLowestFreeCellLevel(freeList ChainCellList, l CellLevel) CellLevel {
for ; l <= CellLevel(len(freeList)); l++ {
@@ -96,14 +167,22 @@ func mapVirtualPlacementToPhysical(
preassignedCells []*cellBindingPathVertex,
nonPreassignedCells [][]*cellBindingPathVertex,
freeList ChainCellList,
freeCellNum map[CellLevel]int32,
suggestedNodes common.Set,
ignoreSuggestedNodes bool,
bindings map[api.CellAddress]*PhysicalCell) bool {

for _, c := range preassignedCells {
if !buddyAlloc(c, freeList, getLowestFreeCellLevel(
freeList, c.cell.GetLevel()), suggestedNodes, ignoreSuggestedNodes, bindings) {
return false
klog.Info("Buddy allocation failed due to bad cells, try to split higher level cells")
if !safeRelaxedBuddyAlloc(c, freeList, freeCellNum, c.cell.GetLevel(),
suggestedNodes, ignoreSuggestedNodes, bindings) {
klog.Info("Cannot split higher level cells")
return false
}
} else {
freeCellNum[c.cell.GetLevel()]--
}
}
for _, cells := range nonPreassignedCells {
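The splittableNum bookkeeping in safeRelaxedBuddyAlloc above can be followed with a small worked example. The numbers and the uniform fan-out of two children per cell are made up for illustration; the real code reads the fan-out from splittableCell.GetChildren() at each level.

package main

import "fmt"

func main() {
    const currentLevel, topLevel = 1, 3
    childrenPerCell := int32(2)              // assumed uniform fan-out
    freeLen := map[int]int32{3: 3, 2: 1}     // physical free cells per level
    freeCellNum := map[int]int32{3: 1, 2: 2} // free cells VCs may still claim per level

    splittableNum := map[int]int32{}
    for i := topLevel; i > currentLevel; i-- {
        splittableNum[i] = freeLen[i] - freeCellNum[i]
        if i < topLevel {
            splittableNum[i] += splittableNum[i+1] * childrenPerCell
        }
        if splittableNum[i] < 0 {
            panic("VC safety broken") // same safety check as the real code
        }
    }
    fmt.Println(splittableNum[3], splittableNum[2]) // 2 3
}

At level 3 there are 3 free cells of which 1 must stay reserved for VCs, so 2 are splittable; at level 2 the raw balance is 1 - 2 = -1, but the 2 splittable level-3 cells contribute 2 * 2 = 4 children, leaving 3 splittable level-2 cells, so the safety check passes.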
9 changes: 8 additions & 1 deletion pkg/algorithm/hived_algorithm.go
@@ -24,13 +24,14 @@ package algorithm

import (
"fmt"
"sync"

"github.com/microsoft/hivedscheduler/pkg/api"
"github.com/microsoft/hivedscheduler/pkg/common"
"github.com/microsoft/hivedscheduler/pkg/internal"
core "k8s.io/api/core/v1"
meta "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/klog"
"sync"
)

// HivedAlgorithm implements an internal.SchedulerAlgorithm. It schedules affinity groups using the algorithm of HiveD.
@@ -913,10 +914,16 @@ func (h *HivedAlgorithm) scheduleGuaranteedAffinityGroup(
common.SortInt32(gpuNums)
lazyPreemptedGroups := h.tryLazyPreempt(virtualPlacement, gpuNums, sr.affinityGroupName)
preassignedCells, nonPreassignedCells := virtualPlacement.toBindingPaths(gpuNums, bindings)
// make a copy of freeCellNum, may change its values during allocation
freeCellNumCopy := map[CellLevel]int32{}
for k, v := range h.allVCFreeCellNum[sr.chain] {
freeCellNumCopy[k] = v
}
if ok := mapVirtualPlacementToPhysical(
preassignedCells,
nonPreassignedCells,
h.freeCellList[sr.chain].shallowCopy(),
freeCellNumCopy,
sr.suggestedNodes,
sr.ignoreSuggestedNodes,
bindings); ok {
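The freeCellNum copy above matters because Go maps are reference-like: mapVirtualPlacementToPhysical decrements freeCellNum as it binds preassigned cells, and without the copy those decrements would leak back into h.allVCFreeCellNum and affect later scheduling attempts. A minimal stand-alone illustration (the values are hypothetical):

package main

import "fmt"

func main() {
    persistent := map[int]int32{1: 4, 2: 2} // stands in for h.allVCFreeCellNum[chain]

    copyNum := map[int]int32{}
    for k, v := range persistent {
        copyNum[k] = v
    }
    copyNum[1]-- // what the allocator does per preassigned cell

    fmt.Println(persistent[1], copyNum[1]) // 4 3 — the original map is untouched
}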
87 changes: 83 additions & 4 deletions pkg/algorithm/hived_algorithm_test.go
@@ -24,15 +24,16 @@ package algorithm

import (
"fmt"
"net/http"
"sort"
"testing"

"github.com/microsoft/hivedscheduler/pkg/api"
"github.com/microsoft/hivedscheduler/pkg/common"
"github.com/microsoft/hivedscheduler/pkg/internal"
core "k8s.io/api/core/v1"
meta "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"net/http"
"sort"
"testing"
)

var allPods = map[string]*core.Pod{}
@@ -63,7 +64,8 @@ func initNodes(h *HivedAlgorithm) {
}

var group1, group2, group3, group4, group5, group6, group7, group8, group9, group10, group11, group12, group13, group14,
group15, group16, group17, group18, group19, group20, group21, group22, group23, group24, group25, group26, group27, group28, group29, group30, group31 = &api.AffinityGroupSpec{
group15, group16, group17, group18, group19, group20, group21, group22, group23, group24, group25, group26, group27,
group28, group29, group30, group31, group32, group33, group34 = &api.AffinityGroupSpec{
Name: "group1",
Members: []api.AffinityGroupMemberSpec{{PodNumber: 1, GpuNumber: 1}},
}, &api.AffinityGroupSpec{
@@ -156,6 +158,15 @@ var group1, group2, group3, group4, group5, group6, group7, group8, group9, grou
}, &api.AffinityGroupSpec{
Name: "group31",
Members: []api.AffinityGroupMemberSpec{{PodNumber: 1, GpuNumber: 16}},
}, &api.AffinityGroupSpec{
Name: "group32",
Members: []api.AffinityGroupMemberSpec{{PodNumber: 1, GpuNumber: 16}},
}, &api.AffinityGroupSpec{
Name: "group33",
Members: []api.AffinityGroupMemberSpec{{PodNumber: 1, GpuNumber: 16}},
}, &api.AffinityGroupSpec{
Name: "group34",
Members: []api.AffinityGroupMemberSpec{{PodNumber: 1, GpuNumber: 16}},
}

var pss = map[types.UID]api.PodSchedulingSpec{
@@ -503,6 +514,30 @@ var pss = map[types.UID]api.PodSchedulingSpec{
GpuType: "DGX2-V100",
GpuNumber: 16,
AffinityGroup: group31,
}, "pod44": { // safe relaxed buddy allocate for bad node test
VirtualCluster: "VC1",
Priority: 0,
LazyPreemptionEnable: true,
PinnedCellId: "",
GpuType: "DGX2-V100",
GpuNumber: 16,
AffinityGroup: group32,
}, "pod45": { // safe relaxed buddy allocate for bad node test
VirtualCluster: "VC1",
Priority: 0,
LazyPreemptionEnable: true,
PinnedCellId: "",
GpuType: "DGX2-V100",
GpuNumber: 16,
AffinityGroup: group33,
}, "pod46": { // safe relaxed buddy allocate safety test
VirtualCluster: "VC1",
Priority: 0,
LazyPreemptionEnable: true,
PinnedCellId: "",
GpuType: "DGX2-V100",
GpuNumber: 16,
AffinityGroup: group34,
},
}

@@ -552,6 +587,8 @@ var expectedBindInfos = map[string]result{
"pod38": {node: "0.0.3.1", gpuIsolation: []int32{0}},
"pod39": {node: "0.0.3.2", gpuIsolation: []int32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}},
"pod40": {node: "0.0.4.3", gpuIsolation: []int32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}},
"pod44": {node: "0.0.3.2", gpuIsolation: []int32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}},
"pod45": {node: "0.0.4.2", gpuIsolation: []int32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}},
}

var expectedPreemptInfos = map[string]common.Set{
@@ -589,6 +626,7 @@ func TestHivedAlgorithm(t *testing.T) {
testSuggestedNodes(t, configFilePath)
testStatefulPreemption(t, configFilePath)
testBadNodes(t, configFilePath)
testSafeRelaxedBuddyAlloc(t, configFilePath)
testReconfiguration(t, configFilePath)
testInvalidInitialAssignment(t, sConfig)
}
@@ -960,6 +998,47 @@ func testBadNodes(t *testing.T, configFilePath string) {
}
}

func testSafeRelaxedBuddyAlloc(t *testing.T, configFilePath string) {
sConfig := api.NewConfig(api.InitRawConfig(&configFilePath))
(*sConfig.VirtualClusters)["VC1"].VirtualCells[0].CellNumber = 4
(*sConfig.VirtualClusters)["VC1"].VirtualCells[2].CellNumber = 0
(*sConfig.VirtualClusters)["VC1"].VirtualCells[3].CellNumber = 0
(*sConfig.VirtualClusters)["VC2"].VirtualCells[2].CellType = "4-DGX2-V100-NODE.2-DGX2-V100-NODE"
(*sConfig.VirtualClusters)["VC2"].VirtualCells[2].CellNumber = 1
h := NewHivedAlgorithm(sConfig)
for _, chains := range h.cellChains {
sortChains(chains)
}
setHealthyNodes(h)
allocatedPods = []*core.Pod{}

pod := allPods["pod44"]
pod.Annotations[api.AnnotationKeyPodSchedulingSpec] = common.ToYaml(pss[pod.UID])
psr := h.Schedule(pod, []string{"0.0.3.2", "0.0.3.3", "0.0.4.2", "0.0.4.3"}, internal.PreemptingPhase)
bindingPod := internal.NewBindingPod(pod, psr.PodBindInfo)
h.AddAllocatedPod(bindingPod)
allocatedPods = append(allocatedPods, bindingPod)
compareSchedulingResult(t, pod, psr)

h.setBadNode("0.0.3.3")
pod = allPods["pod45"]
pod.Annotations[api.AnnotationKeyPodSchedulingSpec] = common.ToYaml(pss[pod.UID])
psr = h.Schedule(pod, []string{"0.0.3.2", "0.0.3.3", "0.0.4.2", "0.0.4.3"}, internal.PreemptingPhase)
if psr.PodBindInfo == nil {
t.Errorf("Cannot split higher level cells when requested level cell is bad")
}
bindingPod = internal.NewBindingPod(pod, psr.PodBindInfo)
h.AddAllocatedPod(bindingPod)
allocatedPods = append(allocatedPods, bindingPod)
compareSchedulingResult(t, pod, psr)

h.setBadNode("0.0.4.3")
pod = allPods["pod46"]
pod.Annotations[api.AnnotationKeyPodSchedulingSpec] = common.ToYaml(pss[pod.UID])
psr = h.Schedule(pod, []string{"0.0.3.2", "0.0.3.3", "0.0.4.0", "0.0.4.1", "0.0.4.2", "0.0.4.3"}, internal.PreemptingPhase)
compareSchedulingResult(t, pod, psr)
}

func testReconfiguration(t *testing.T, configFilePath string) {
oldConfig := api.NewConfig(api.InitRawConfig(&configFilePath))
h := NewHivedAlgorithm(oldConfig)
6 changes: 5 additions & 1 deletion pkg/algorithm/topology_aware_scheduler.go
@@ -24,9 +24,10 @@ package algorithm

import (
"fmt"
"sort"

"github.com/microsoft/hivedscheduler/pkg/api"
"github.com/microsoft/hivedscheduler/pkg/common"
"sort"
)

// topologyAwareScheduler can schedule a set of pods on a cluster view.
@@ -433,6 +434,9 @@ func removePickedGpus(gpus CellList, indices []int32) CellList {
copy(gpus[index-offset:], gpus[index+1:])
}
}
for i := len(gpus) - len(indices); i < len(gpus); i++ {
gpus[i] = nil
}
return gpus[:len(gpus)-len(indices)]
}

Expand Down
