Skip to content

Commit

Permalink
[Bug] All worker Pods are deleted if using KubeRay v1.0.0 CRD with KubeRay operator v1.1.0 image (#2087)
Browse files Browse the repository at this point in the history
  • Loading branch information
kevin85421 committed Apr 18, 2024
1 parent 791ea37 commit 20636f9
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 1 deletion.
7 changes: 6 additions & 1 deletion ray-operator/controllers/ray/raycluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -796,10 +796,15 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
}
}
// A replica can contain multiple hosts, so we need to calculate this based on the number of hosts per replica.
// If the user doesn't install the CRD with `NumOfHosts`, the zero value of `NumOfHosts`, which is 0, will be used.
// Hence, all workers will be deleted. Here, we set `NumOfHosts` to max(1, `NumOfHosts`) to avoid this situation.
if worker.NumOfHosts <= 0 {
worker.NumOfHosts = 1
}
numExpectedPods := workerReplicas * worker.NumOfHosts
diff := numExpectedPods - int32(len(runningPods.Items))

logger.Info("reconcilePods", "workerReplicas", workerReplicas, "runningPods", len(runningPods.Items), "diff", diff)
logger.Info("reconcilePods", "workerReplicas", workerReplicas, "NumOfHosts", worker.NumOfHosts, "runningPods", len(runningPods.Items), "diff", diff)

if diff > 0 {
// pods need to be added
Expand Down
39 changes: 39 additions & 0 deletions ray-operator/controllers/ray/raycluster_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -686,4 +686,43 @@ var _ = Context("Inside the default namespace", func() {
Expect(rayCluster.Status.DesiredCPU).To(Equal(desiredCPU))
})
})

Describe("RayCluster with invalid NumOfHosts", func() {
	// Some users only upgrade the KubeRay image without upgrading the CRD. For example, when a
	// user upgrades the KubeRay operator from v1.0.0 to v1.1.0 without upgrading the CRD, the
	// KubeRay operator will read the zero value of `NumOfHosts` (0) from the CRD. Without the
	// clamp in reconcilePods, all worker Pods would be deleted. This regression test verifies
	// that the operator treats `NumOfHosts <= 0` as 1 and keeps the worker Pods alive.
	ctx := context.Background()
	namespace := "default"
	rayCluster := rayClusterTemplate("raycluster-invalid-numofhosts", namespace)
	numOfHosts := int32(0)
	rayCluster.Spec.WorkerGroupSpecs[0].NumOfHosts = numOfHosts
	workerPods := corev1.PodList{}
	workerFilters := common.RayClusterGroupPodsAssociationOptions(rayCluster, rayCluster.Spec.WorkerGroupSpecs[0].GroupName).ToListOptions()

	It("Verify RayCluster spec", func() {
		// These tests are designed based on the following assumptions:
		// (1) There is only one worker group, its `replicas` is set to 3, and `workersToDelete` is empty.
		// (2) The worker group has an invalid `numOfHosts` value of 0.
		Expect(rayCluster.Spec.WorkerGroupSpecs).To(HaveLen(1))
		Expect(rayCluster.Spec.WorkerGroupSpecs[0].NumOfHosts).To(Equal(numOfHosts))
		Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(pointer.Int32(3)))
		Expect(rayCluster.Spec.WorkerGroupSpecs[0].ScaleStrategy.WorkersToDelete).To(BeEmpty())
	})

	It("Create a RayCluster custom resource", func() {
		err := k8sClient.Create(ctx, rayCluster)
		Expect(err).NotTo(HaveOccurred(), "Failed to create RayCluster")
		Eventually(
			getResourceFunc(ctx, client.ObjectKey{Name: rayCluster.Name, Namespace: namespace}, rayCluster),
			time.Second*3, time.Millisecond*500).Should(BeNil(), "Should be able to see RayCluster: %v", rayCluster.Name)
	})

	It("Check the number of worker Pods", func() {
		// The operator clamps NumOfHosts to max(1, NumOfHosts), so the expected Pod count is
		// replicas (3) * 1 host. The original `3 * int(numOfHosts)` evaluated to 0, which both
		// contradicts the purpose of this test (Pods must NOT be deleted) and passes vacuously
		// before the operator has reconciled at all.
		numWorkerPods := 3 * 1
		Eventually(
			listResourceFunc(ctx, &workerPods, workerFilters...),
			time.Second*3, time.Millisecond*500).Should(Equal(numWorkerPods), fmt.Sprintf("workerGroup %v", workerPods.Items))
		// Guard against the original bug: make sure the operator does not delete the worker
		// Pods on subsequent reconciliations.
		Consistently(
			listResourceFunc(ctx, &workerPods, workerFilters...),
			time.Second*2, time.Millisecond*500).Should(Equal(numWorkerPods), fmt.Sprintf("workerGroup %v", workerPods.Items))
	})
})
})

0 comments on commit 20636f9

Please sign in to comment.