Skip to content

Commit

Permalink
Use longer exec probe timeouts for Head pods (#2353)
Browse files Browse the repository at this point in the history
Signed-off-by: Andrew Sy Kim <andrewsy@google.com>
  • Loading branch information
andrewsykim authored Sep 10, 2024
1 parent fb7a486 commit 6cbb5df
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 10 deletions.
13 changes: 11 additions & 2 deletions ray-operator/controllers/ray/common/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -264,9 +264,14 @@ func initLivenessAndReadinessProbe(rayContainer *corev1.Container, rayNodeType r
}

if rayContainer.LivenessProbe == nil {
probeTimeout := utils.DefaultLivenessProbeTimeoutSeconds
if rayNodeType == rayv1.HeadNode {
probeTimeout = utils.DefaultHeadLivenessProbeTimeoutSeconds
}

rayContainer.LivenessProbe = &corev1.Probe{
InitialDelaySeconds: utils.DefaultLivenessProbeInitialDelaySeconds,
TimeoutSeconds: utils.DefaultLivenessProbeTimeoutSeconds,
TimeoutSeconds: int32(probeTimeout),
PeriodSeconds: utils.DefaultLivenessProbePeriodSeconds,
SuccessThreshold: utils.DefaultLivenessProbeSuccessThreshold,
FailureThreshold: utils.DefaultLivenessProbeFailureThreshold,
Expand All @@ -275,9 +280,13 @@ func initLivenessAndReadinessProbe(rayContainer *corev1.Container, rayNodeType r
}

if rayContainer.ReadinessProbe == nil {
probeTimeout := utils.DefaultReadinessProbeTimeoutSeconds
if rayNodeType == rayv1.HeadNode {
probeTimeout = utils.DefaultHeadReadinessProbeTimeoutSeconds
}
rayContainer.ReadinessProbe = &corev1.Probe{
InitialDelaySeconds: utils.DefaultReadinessProbeInitialDelaySeconds,
TimeoutSeconds: utils.DefaultReadinessProbeTimeoutSeconds,
TimeoutSeconds: int32(probeTimeout),
PeriodSeconds: utils.DefaultReadinessProbePeriodSeconds,
SuccessThreshold: utils.DefaultReadinessProbeSuccessThreshold,
FailureThreshold: utils.DefaultReadinessProbeFailureThreshold,
Expand Down
18 changes: 17 additions & 1 deletion ray-operator/controllers/ray/common/pod_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1128,7 +1128,7 @@ func TestInitLivenessAndReadinessProbe(t *testing.T) {
assert.Nil(t, rayContainer.LivenessProbe.Exec)
assert.Nil(t, rayContainer.ReadinessProbe.Exec)

// Test 2: User does not define a custom probe. KubeRay will inject Exec probe.
// Test 2: User does not define a custom probe. KubeRay will inject Exec probe for worker pod.
// Here we test the case where the Ray Pod originates from RayServiceCRD,
// implying that an additional serve health check will be added to the readiness probe.
rayContainer.LivenessProbe = nil
Expand All @@ -1138,4 +1138,20 @@ func TestInitLivenessAndReadinessProbe(t *testing.T) {
assert.NotNil(t, rayContainer.ReadinessProbe.Exec)
assert.False(t, strings.Contains(strings.Join(rayContainer.LivenessProbe.Exec.Command, " "), utils.RayServeProxyHealthPath))
assert.True(t, strings.Contains(strings.Join(rayContainer.ReadinessProbe.Exec.Command, " "), utils.RayServeProxyHealthPath))
assert.Equal(t, int32(2), rayContainer.LivenessProbe.TimeoutSeconds)
assert.Equal(t, int32(2), rayContainer.ReadinessProbe.TimeoutSeconds)

// Test 3: User does not define a custom probe. KubeRay will inject Exec probe for head pod.
// Here we test the case where the Ray Pod originates from RayServiceCRD.
// Unlike the worker pod case, no Ray Serve proxy health check is expected in the head pod's probes.
rayContainer.LivenessProbe = nil
rayContainer.ReadinessProbe = nil
initLivenessAndReadinessProbe(rayContainer, rayv1.HeadNode, utils.RayServiceCRD)
assert.NotNil(t, rayContainer.LivenessProbe.Exec)
assert.NotNil(t, rayContainer.ReadinessProbe.Exec)
// head pod should not have Ray Serve proxy health probes
assert.False(t, strings.Contains(strings.Join(rayContainer.LivenessProbe.Exec.Command, " "), utils.RayServeProxyHealthPath))
assert.False(t, strings.Contains(strings.Join(rayContainer.ReadinessProbe.Exec.Command, " "), utils.RayServeProxyHealthPath))
assert.Equal(t, int32(5), rayContainer.LivenessProbe.TimeoutSeconds)
assert.Equal(t, int32(5), rayContainer.ReadinessProbe.TimeoutSeconds)
}
18 changes: 11 additions & 7 deletions ray-operator/controllers/ray/utils/constant.go
Original file line number Diff line number Diff line change
Expand Up @@ -150,17 +150,21 @@ const (
// Ray FT default readiness probe values
DefaultReadinessProbeInitialDelaySeconds = 10
DefaultReadinessProbeTimeoutSeconds = 2
DefaultReadinessProbePeriodSeconds = 5
DefaultReadinessProbeSuccessThreshold = 1
DefaultReadinessProbeFailureThreshold = 10
ServeReadinessProbeFailureThreshold = 1
// Probe timeout for Head pod needs to be longer as it queries two endpoints (api/local_raylet_healthz & api/gcs_healthz)
DefaultHeadReadinessProbeTimeoutSeconds = 5
DefaultReadinessProbePeriodSeconds = 5
DefaultReadinessProbeSuccessThreshold = 1
DefaultReadinessProbeFailureThreshold = 10
ServeReadinessProbeFailureThreshold = 1

// Ray FT default liveness probe values
DefaultLivenessProbeInitialDelaySeconds = 30
DefaultLivenessProbeTimeoutSeconds = 2
DefaultLivenessProbePeriodSeconds = 5
DefaultLivenessProbeSuccessThreshold = 1
DefaultLivenessProbeFailureThreshold = 120
// Probe timeout for Head pod needs to be longer as it queries two endpoints (api/local_raylet_healthz & api/gcs_healthz)
DefaultHeadLivenessProbeTimeoutSeconds = 5
DefaultLivenessProbePeriodSeconds = 5
DefaultLivenessProbeSuccessThreshold = 1
DefaultLivenessProbeFailureThreshold = 120

// Ray health check related configurations
// Note: Since the Raylet process and the dashboard agent process are fate-sharing,
Expand Down

0 comments on commit 6cbb5df

Please sign in to comment.