Commit
Allow provisioner to be configured to force on-demand nodes & disable auto-upgrade (#656)

* Allow provisioner to be configured to force on-demand nodes

* Disable auto-upgrade on node pools
nstogner authored May 6, 2024
1 parent 5144816 commit a4ee38a
Showing 4 changed files with 120 additions and 24 deletions.
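The on-demand override is driven by a single environment variable, GCP_FORCE_ON_DEMAND, added to the provisioner's config struct in cmd/main.go below. A minimal sketch of how such a flag is read, assuming the kelseyhightower/envconfig package implied by the struct tags (the config struct and main function here are illustrative, not the provisioner's own):

package main

import (
	"fmt"
	"log"
	"os"

	"github.com/kelseyhightower/envconfig"
)

// config mirrors only the new field from the provisioner's env-driven configuration.
type config struct {
	GCPForceOnDemand bool `envconfig:"GCP_FORCE_ON_DEMAND" default:"false"`
}

func main() {
	// Operators would set this on the provisioner's Deployment; it is set here
	// only to make the sketch self-contained.
	os.Setenv("GCP_FORCE_ON_DEMAND", "true")

	var c config
	if err := envconfig.Process("", &c); err != nil {
		log.Fatal(err)
	}
	fmt.Println(c.GCPForceOnDemand) // true
}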
5 changes: 5 additions & 0 deletions tpu-provisioner/cmd/main.go
@@ -83,6 +83,10 @@ func main() {
GCPNodeSecondaryDisk string `envconfig:"GCP_NODE_SECONDARY_DISK" default:""`
GCPNodeSecureBoot bool `envconfig:"GCP_NODE_SECURE_BOOT" default:"true"`

+// GCPForceOnDemand forces the controller to create nodes on demand, even if
+// the Pod requests a reservation or spot.
+GCPForceOnDemand bool `envconfig:"GCP_FORCE_ON_DEMAND" default:"false"`

// NodeMinLifespan is the amount of time that should pass between a Node object
// creation and a cleanup of that Node. This needs to be long enough to allow
// the node to become Ready and for a pending Pod to be scheduled on it.
@@ -203,6 +207,7 @@ func main() {
NodeSecondaryDisk: cfg.GCPNodeSecondaryDisk,
NodeTags: cfg.GCPNodeTags,
NodeSecureBoot: cfg.GCPNodeSecureBoot,
+ForceOnDemand: cfg.GCPForceOnDemand,
},
Recorder: mgr.GetEventRecorderFor("tpu-provisioner"),
}
43 changes: 23 additions & 20 deletions tpu-provisioner/internal/cloud/gke.go
@@ -276,27 +276,30 @@ func (g *GKE) nodePoolForPod(name string, p *corev1.Pod) (*containerv1beta1.Node
}

var reservation *containerv1beta1.ReservationAffinity
-if resName, ok := p.Spec.NodeSelector["cloud.google.com/reservation-name"]; ok {
-reservation = &containerv1beta1.ReservationAffinity{
-ConsumeReservationType: "SPECIFIC_RESERVATION",
-Key: "compute.googleapis.com/reservation-name",
-Values: []string{
-resName,
-},
-}
-}

var taints []*containerv1beta1.NodeTaint
+var spot bool

+if !g.ClusterContext.ForceOnDemand {
+if resName, ok := p.Spec.NodeSelector["cloud.google.com/reservation-name"]; ok {
+reservation = &containerv1beta1.ReservationAffinity{
+ConsumeReservationType: "SPECIFIC_RESERVATION",
+Key: "compute.googleapis.com/reservation-name",
+Values: []string{
+resName,
+},
+}
+}

-spot := p.Spec.NodeSelector["cloud.google.com/gke-spot"] == "true"
-if spot {
-// Add the taint that NAP would add.
-// https://cloud.google.com/kubernetes-engine/docs/concepts/spot-vms#spotvms-nap
-taints = append(taints, &containerv1beta1.NodeTaint{
-Key: "cloud.google.com/gke-spot",
-Value: "true",
-Effect: "NO_SCHEDULE",
-})
+spot = p.Spec.NodeSelector["cloud.google.com/gke-spot"] == "true"
+if spot {
+// Add the taint that NAP would add.
+// https://cloud.google.com/kubernetes-engine/docs/concepts/spot-vms#spotvms-nap
+taints = append(taints, &containerv1beta1.NodeTaint{
+Key: "cloud.google.com/gke-spot",
+Value: "true",
+Effect: "NO_SCHEDULE",
+})
+}
}

var secondaryDisks []*containerv1beta1.SecondaryBootDisk
@@ -336,7 +339,7 @@ func (g *GKE) nodePoolForPod(name string, p *corev1.Pod) (*containerv1beta1.Node
},
Management: &containerv1beta1.NodeManagement{
AutoRepair: true,
-AutoUpgrade: true,
+AutoUpgrade: false,
},
UpgradeSettings: &containerv1beta1.UpgradeSettings{
MaxSurge: 1,
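The net effect of this hunk is that the reservation and spot branches run only when on-demand is not forced. A condensed sketch of that decision, using a hypothetical helper (decideCapacity is not a function in the provisioner):

package main

import "fmt"

// decideCapacity distills the new branch in nodePoolForPod: when on-demand is
// forced, the Pod's reservation and spot selectors are ignored entirely.
func decideCapacity(forceOnDemand bool, sel map[string]string) (reservation string, spot bool) {
	if forceOnDemand {
		return "", false
	}
	return sel["cloud.google.com/reservation-name"], sel["cloud.google.com/gke-spot"] == "true"
}

func main() {
	sel := map[string]string{"cloud.google.com/gke-spot": "true"}
	fmt.Println(decideCapacity(false, sel)) // "" true  -> spot pool with the NO_SCHEDULE taint
	fmt.Println(decideCapacity(true, sel))  // "" false -> plain on-demand pool
}

The same file also flips AutoUpgrade from true to false in the pool's NodeManagement block, so node pools created by the provisioner keep auto-repair but are no longer auto-upgraded by GKE.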
1 change: 1 addition & 0 deletions tpu-provisioner/internal/cloud/gke_context.go
@@ -11,6 +11,7 @@ type GKEContext struct {
NodeSecondaryDisk string
NodeTags []string
NodeSecureBoot bool
+ForceOnDemand bool
}

func (c GKEContext) ClusterName() string {
95 changes: 91 additions & 4 deletions tpu-provisioner/internal/cloud/gke_test.go
@@ -242,7 +242,67 @@ func TestNodePoolForPod(t *testing.T) {
},
InitialNodeCount: 512,
Locations: []string{""},
-Management: &container.NodeManagement{AutoRepair: true, AutoUpgrade: true},
+Management: &container.NodeManagement{AutoRepair: true, AutoUpgrade: false},
MaxPodsConstraint: &container.MaxPodsConstraint{MaxPodsPerNode: 15},
Name: "test-pool",
PlacementPolicy: &container.PlacementPolicy{TpuTopology: "8x16x16", Type: "COMPACT"},
UpgradeSettings: &container.UpgradeSettings{MaxSurge: 1},
},
},
+{
+desc: "spot",
+selector: map[string]string{
+"cloud.google.com/gke-spot": "true",
+},
+want: &containerv1beta1.NodePool{
+Config: &container.NodeConfig{
+Labels: map[string]string{
+"google.com/nodepool-manager": "tpu-provisioner",
+"google.com/tpu-provisioner-jobset-name": "jobset-test",
+"google.com/tpu-provisioner-jobset-namespace": "default",
+"google.com/tpu-provisioner-parent-kind": "job",
+"google.com/tpu-provisioner-parent-name": "jobset-test-job-1-0",
+"google.com/tpu-provisioner-parent-namespace": "default",
+},
+MachineType: "ct5p-hightpu-4t",
+ShieldedInstanceConfig: &container.ShieldedInstanceConfig{EnableIntegrityMonitoring: true},
+Spot: true,
+Taints: []*container.NodeTaint{
+{Effect: "NO_SCHEDULE", Key: "cloud.google.com/gke-spot", Value: "true"},
+},
+},
+InitialNodeCount: 512,
+Locations: []string{""},
+Management: &container.NodeManagement{AutoRepair: true, AutoUpgrade: false},
+MaxPodsConstraint: &container.MaxPodsConstraint{MaxPodsPerNode: 15},
+Name: "test-pool",
+PlacementPolicy: &container.PlacementPolicy{TpuTopology: "8x16x16", Type: "COMPACT"},
+UpgradeSettings: &container.UpgradeSettings{MaxSurge: 1},
+},
+},
+{
+desc: "spot with forced on demand",
+gkeContext: GKEContext{ForceOnDemand: true},
+selector: map[string]string{
+"cloud.google.com/gke-spot": "true",
+},
+want: &containerv1beta1.NodePool{
+Config: &container.NodeConfig{
+Labels: map[string]string{
+"google.com/nodepool-manager": "tpu-provisioner",
+"google.com/tpu-provisioner-jobset-name": "jobset-test",
+"google.com/tpu-provisioner-jobset-namespace": "default",
+"google.com/tpu-provisioner-parent-kind": "job",
+"google.com/tpu-provisioner-parent-name": "jobset-test-job-1-0",
+"google.com/tpu-provisioner-parent-namespace": "default",
+},
+MachineType: "ct5p-hightpu-4t",
+ShieldedInstanceConfig: &container.ShieldedInstanceConfig{EnableIntegrityMonitoring: true},
+Spot: false,
+},
+InitialNodeCount: 512,
+Locations: []string{""},
+Management: &container.NodeManagement{AutoRepair: true, AutoUpgrade: false},
+MaxPodsConstraint: &container.MaxPodsConstraint{MaxPodsPerNode: 15},
+Name: "test-pool",
+PlacementPolicy: &container.PlacementPolicy{TpuTopology: "8x16x16", Type: "COMPACT"},
@@ -272,7 +332,34 @@ func TestNodePoolForPod(t *testing.T) {
},
InitialNodeCount: 512,
Locations: []string{""},
-Management: &container.NodeManagement{AutoRepair: true, AutoUpgrade: true},
+Management: &container.NodeManagement{AutoRepair: true, AutoUpgrade: false},
MaxPodsConstraint: &container.MaxPodsConstraint{MaxPodsPerNode: 15},
Name: "test-pool",
PlacementPolicy: &container.PlacementPolicy{TpuTopology: "8x16x16", Type: "COMPACT"},
UpgradeSettings: &container.UpgradeSettings{MaxSurge: 1},
},
},
+{
+desc: "pod with reservation selector but on demand is forced",
+selector: map[string]string{"cloud.google.com/reservation-name": "tpu-rsv"},
+gkeContext: GKEContext{ForceOnDemand: true},
+want: &containerv1beta1.NodePool{
+Config: &container.NodeConfig{
+Labels: map[string]string{
+"google.com/nodepool-manager": "tpu-provisioner",
+"google.com/tpu-provisioner-jobset-name": "jobset-test",
+"google.com/tpu-provisioner-jobset-namespace": "default",
+"google.com/tpu-provisioner-parent-kind": "job",
+"google.com/tpu-provisioner-parent-name": "jobset-test-job-1-0",
+"google.com/tpu-provisioner-parent-namespace": "default",
+},
+MachineType: "ct5p-hightpu-4t",
+ReservationAffinity: nil,
+ShieldedInstanceConfig: &container.ShieldedInstanceConfig{EnableIntegrityMonitoring: true},
+},
+InitialNodeCount: 512,
+Locations: []string{""},
+Management: &container.NodeManagement{AutoRepair: true, AutoUpgrade: false},
+MaxPodsConstraint: &container.MaxPodsConstraint{MaxPodsPerNode: 15},
+Name: "test-pool",
+PlacementPolicy: &container.PlacementPolicy{TpuTopology: "8x16x16", Type: "COMPACT"},
@@ -298,7 +385,7 @@ func TestNodePoolForPod(t *testing.T) {
},
InitialNodeCount: 512,
Locations: []string{""},
-Management: &container.NodeManagement{AutoRepair: true, AutoUpgrade: true},
+Management: &container.NodeManagement{AutoRepair: true, AutoUpgrade: false},
MaxPodsConstraint: &container.MaxPodsConstraint{MaxPodsPerNode: 15},
Name: "test-pool",
PlacementPolicy: &container.PlacementPolicy{TpuTopology: "8x16x16", Type: "COMPACT"},
@@ -329,7 +416,7 @@ func TestNodePoolForPod(t *testing.T) {
},
InitialNodeCount: 512,
Locations: []string{""},
-Management: &container.NodeManagement{AutoRepair: true, AutoUpgrade: true},
+Management: &container.NodeManagement{AutoRepair: true, AutoUpgrade: false},
MaxPodsConstraint: &container.MaxPodsConstraint{MaxPodsPerNode: 15},
Name: "test-pool",
PlacementPolicy: &container.PlacementPolicy{TpuTopology: "8x16x16", Type: "COMPACT"},
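The new test cases above pin down how the gke-spot selector interacts with the forced on-demand mode. A small sketch of the kind of Pod they model, assuming the k8s.io/api/core/v1 types already used by the provisioner (the Pod itself is illustrative):

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
)

func main() {
	// A Pod that requests spot capacity via its node selector, as in the
	// "spot with forced on demand" test case.
	pod := corev1.Pod{
		Spec: corev1.PodSpec{
			NodeSelector: map[string]string{"cloud.google.com/gke-spot": "true"},
		},
	}
	fmt.Println(pod.Spec.NodeSelector)
	// With GCP_FORCE_ON_DEMAND=true the provisioner is expected to create the
	// pool with Spot: false and without the gke-spot taint.
}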
