From deebfab808c8ba38954c7dc1f92e8f91be879469 Mon Sep 17 00:00:00 2001 From: Yuki Iwai Date: Thu, 8 Aug 2024 02:11:08 +0900 Subject: [PATCH 1/6] Fix formatting issues Signed-off-by: Yuki Iwai --- docs/proposals/2170-kubeflow-training-v2/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/proposals/2170-kubeflow-training-v2/README.md b/docs/proposals/2170-kubeflow-training-v2/README.md index cc1aa4deb4..9a06e3a7af 100644 --- a/docs/proposals/2170-kubeflow-training-v2/README.md +++ b/docs/proposals/2170-kubeflow-training-v2/README.md @@ -321,9 +321,9 @@ type TrainingRuntimeRef struct { type TrainJobStatus struct { // Conditions for the TrainJob. Initially, it will have the same conditions as JobSet. Conditions []metav1.Condition `json:"conditions,omitempty"` - - // ReplicatedJobsStatus track the number of Jobs for each replicatedJob in JobSet. - ReplicatedJobsStatus []ReplicatedJobStatus `json:"replicatedJobsStatus,omitempty"` + + // ReplicatedJobsStatus track the number of Jobs for each replicatedJob in JobSet. + ReplicatedJobsStatus []ReplicatedJobStatus `json:"replicatedJobsStatus,omitempty"` } type ReplicatedJobStatus struct { From a4872f84a5e37af76cc2535d4f52af17e51d69df Mon Sep 17 00:00:00 2001 From: Yuki Iwai Date: Thu, 8 Aug 2024 03:59:10 +0900 Subject: [PATCH 2/6] Make trainingRuntimeRef more clarify Signed-off-by: Yuki Iwai --- .../2170-kubeflow-training-v2/README.md | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/docs/proposals/2170-kubeflow-training-v2/README.md b/docs/proposals/2170-kubeflow-training-v2/README.md index 9a06e3a7af..eb980bfe5f 100644 --- a/docs/proposals/2170-kubeflow-training-v2/README.md +++ b/docs/proposals/2170-kubeflow-training-v2/README.md @@ -305,17 +305,18 @@ type TrainJobSpec struct { } type TrainingRuntimeRef struct { - // Name for the training runtime. + // Name for the runtime. + // This must indicate the runtime deployed in the same namespace as the TrainJob + // when TrainingRuntime is used in the kind. Name string `json:"name"` + + // APIVersion is the apiVersion for the runtime. + // Defaults to the v2alpha1. + APIVersion *string `json:apiVersion,omitempty` - // Namespace for the runtime. In that case, user should use TrainingRuntime - Namespace *string `json:"namespace,omitempty"` - - // Kind for the runtime. TrainingRuntime or ClusterTrainingRuntime + // Kind for the runtime, which must be TrainingRuntime or ClusterTrainingRuntime. + // Defaults to the TrainingRuntime. Kind *string `json:"kind,omitempty"` - - // Version for the runtime. For example: v2alpha1 - Version *string `json:"version,omitempty"` } type TrainJobStatus struct { From 82d6fe581bacb0b4961b527f00385ece8fc1311c Mon Sep 17 00:00:00 2001 From: Yuki Iwai Date: Thu, 8 Aug 2024 04:37:06 +0900 Subject: [PATCH 3/6] Update the managedBy specifications Signed-off-by: Yuki Iwai --- .../2170-kubeflow-training-v2/README.md | 30 +++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/docs/proposals/2170-kubeflow-training-v2/README.md b/docs/proposals/2170-kubeflow-training-v2/README.md index eb980bfe5f..14aaa0a33d 100644 --- a/docs/proposals/2170-kubeflow-training-v2/README.md +++ b/docs/proposals/2170-kubeflow-training-v2/README.md @@ -301,6 +301,18 @@ type TrainJobSpec struct { Suspend *bool `json:"suspend,omitempty"` // ManagedBy is used to indicate the controller or entity that manages a TrainJob. + // The value must be either an empty, 'training-operator.kubeflow.org/trainjob-controller' or + // 'kueue.x-k8s.io/multikueue'. + // The built-in TrainJob controller reconciles TrainJob which don't have this + // field at all or the field value is the reserved string + // 'training-operator.kubeflow.org/trainjob-controller', but delegates reconciling TrainJobs + // with a 'kueue.x-k8s.io/multikueue' to the Kueue. + // + // The value must be a valid domain-prefixed path (e.g. acme.io/foo) - + // all characters before the first "/" must be a valid subdomain as defined + // by RFC 1123. All characters trailing the first "/" must be valid HTTP Path + // characters as defined by RFC 3986. The value cannot exceed 63 characters. + // The field is immutable. ManagedBy *string `json:"managedBy,omitempty"` } @@ -309,7 +321,7 @@ type TrainingRuntimeRef struct { // This must indicate the runtime deployed in the same namespace as the TrainJob // when TrainingRuntime is used in the kind. Name string `json:"name"` - + // APIVersion is the apiVersion for the runtime. // Defaults to the v2alpha1. APIVersion *string `json:apiVersion,omitempty` @@ -322,7 +334,7 @@ type TrainingRuntimeRef struct { type TrainJobStatus struct { // Conditions for the TrainJob. Initially, it will have the same conditions as JobSet. Conditions []metav1.Condition `json:"conditions,omitempty"` - + // ReplicatedJobsStatus track the number of Jobs for each replicatedJob in JobSet. ReplicatedJobsStatus []ReplicatedJobStatus `json:"replicatedJobsStatus,omitempty"` } @@ -1591,3 +1603,17 @@ framework that users want to run on Kubernetes. Since frameworks share common functionality for distributed training (data parallelizm or model parallelizm). For some specific use-cases like MPI or Elastic PyTorch, we will leverage `MLSpec` parameter. + +### Allow users to specify arbitrary value in the managedBy field + +We can allow users to specify the arbitrary values instead of restricting the `.spec.managedBy` field in the TrainJob +with an empty, 'training-operator.kubeflow.org/trainjob-controller' or 'kusus.x-k8s.io/multikueue'. + +But, the arbitrary values allow users to specify external or in-house customized training-operator, which means that +the TrainJobs are reconciled by the controllers without any specification compliance. + +Specifically, the arbitrary training-operator could bring bugs for the status transitions. +So, we do not support the arbitrary values until we find reasonable use cases that the external controllers +need to reconcile the TrainJob. + +Note that we should implement the status transitions validations to once we support the arbitrary values in the `manageBy` field. From 530b33ea59ae5557366c50883f03feff2806d5f9 Mon Sep 17 00:00:00 2001 From: Yuki Iwai Date: Fri, 9 Aug 2024 01:20:46 +0900 Subject: [PATCH 4/6] Use 'kubeflow.org/trainjob-controller' instead of 'training-operator.kubeflow.org/trainjob-controller' Signed-off-by: Yuki Iwai --- docs/proposals/2170-kubeflow-training-v2/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/proposals/2170-kubeflow-training-v2/README.md b/docs/proposals/2170-kubeflow-training-v2/README.md index 14aaa0a33d..35bce7dedc 100644 --- a/docs/proposals/2170-kubeflow-training-v2/README.md +++ b/docs/proposals/2170-kubeflow-training-v2/README.md @@ -301,11 +301,11 @@ type TrainJobSpec struct { Suspend *bool `json:"suspend,omitempty"` // ManagedBy is used to indicate the controller or entity that manages a TrainJob. - // The value must be either an empty, 'training-operator.kubeflow.org/trainjob-controller' or + // The value must be either an empty, 'kubeflow.org/trainjob-controller' or // 'kueue.x-k8s.io/multikueue'. // The built-in TrainJob controller reconciles TrainJob which don't have this // field at all or the field value is the reserved string - // 'training-operator.kubeflow.org/trainjob-controller', but delegates reconciling TrainJobs + // 'kubeflow.org/trainjob-controller', but delegates reconciling TrainJobs // with a 'kueue.x-k8s.io/multikueue' to the Kueue. // // The value must be a valid domain-prefixed path (e.g. acme.io/foo) - @@ -1607,7 +1607,7 @@ model parallelizm). For some specific use-cases like MPI or Elastic PyTorch, we ### Allow users to specify arbitrary value in the managedBy field We can allow users to specify the arbitrary values instead of restricting the `.spec.managedBy` field in the TrainJob -with an empty, 'training-operator.kubeflow.org/trainjob-controller' or 'kusus.x-k8s.io/multikueue'. +with an empty, 'kubeflow.org/trainjob-controller' or 'kusus.x-k8s.io/multikueue'. But, the arbitrary values allow users to specify external or in-house customized training-operator, which means that the TrainJobs are reconciled by the controllers without any specification compliance. From 153ea1d938049c52c1064cb8def62a1598335687 Mon Sep 17 00:00:00 2001 From: Yuki Iwai Date: Fri, 9 Aug 2024 01:22:17 +0900 Subject: [PATCH 5/6] The ClusterTrainingRuntime is used in the runtimeRef as a default Signed-off-by: Yuki Iwai --- docs/proposals/2170-kubeflow-training-v2/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/proposals/2170-kubeflow-training-v2/README.md b/docs/proposals/2170-kubeflow-training-v2/README.md index 35bce7dedc..5b9d38d9b3 100644 --- a/docs/proposals/2170-kubeflow-training-v2/README.md +++ b/docs/proposals/2170-kubeflow-training-v2/README.md @@ -327,7 +327,7 @@ type TrainingRuntimeRef struct { APIVersion *string `json:apiVersion,omitempty` // Kind for the runtime, which must be TrainingRuntime or ClusterTrainingRuntime. - // Defaults to the TrainingRuntime. + // Defaults to the ClusterTrainingRuntime. Kind *string `json:"kind,omitempty"` } From 43edaef37a119cfbf635fff5c241c3d4b00be00f Mon Sep 17 00:00:00 2001 From: Yuki Iwai Date: Fri, 9 Aug 2024 03:28:08 +0900 Subject: [PATCH 6/6] Move apiVersion for the TrainingRuntime to alternative section Signed-off-by: Yuki Iwai --- .../2170-kubeflow-training-v2/README.md | 29 ++++++++++++++++--- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/docs/proposals/2170-kubeflow-training-v2/README.md b/docs/proposals/2170-kubeflow-training-v2/README.md index 5b9d38d9b3..a13fe5f067 100644 --- a/docs/proposals/2170-kubeflow-training-v2/README.md +++ b/docs/proposals/2170-kubeflow-training-v2/README.md @@ -322,10 +322,6 @@ type TrainingRuntimeRef struct { // when TrainingRuntime is used in the kind. Name string `json:"name"` - // APIVersion is the apiVersion for the runtime. - // Defaults to the v2alpha1. - APIVersion *string `json:apiVersion,omitempty` - // Kind for the runtime, which must be TrainingRuntime or ClusterTrainingRuntime. // Defaults to the ClusterTrainingRuntime. Kind *string `json:"kind,omitempty"` @@ -1617,3 +1613,28 @@ So, we do not support the arbitrary values until we find reasonable use cases th need to reconcile the TrainJob. Note that we should implement the status transitions validations to once we support the arbitrary values in the `manageBy` field. + +### Support Multiple API Versions of TrainingRuntime + +We can consider to introduce the `version` field for runtime API version to the `.spec.trainingRuntimeRef` +so that we can support multiple API versions of TrainingRuntime. + +It could mitigate the pain points when users upgrade the older API Version to newer API Version like alpha to beta. +But, we do not aim to support both Alpha and Beta versions or both first Alpha and second Alpha versions in the specific training-operator release. +Hence, the `version` field was not introduced. + +```go +type TrainingRuntimeRef struct { + [...] + + // APIVersion is the apiVersion for the runtime. + // Defaults to the v2alpha1. + Version *string `json:version,omitempty` + + [...] +} +``` + +However, we may want to revisit this discussion when we graduate the API version from Beta to GA +because in general, it would be better to support both Beta and GA versions for a while +so that we can align with the Kubernetes deprecation policy for mitigating migration obstacles.