Skip to content

Commit

Permalink
Update PodSpecOverride API
Browse files Browse the repository at this point in the history
Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
  • Loading branch information
andreyvelich committed Aug 30, 2024
1 parent a578a31 commit 273af3c
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 33 deletions.
29 changes: 14 additions & 15 deletions docs/proposals/2170-kubeflow-training-v2/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -301,20 +301,19 @@ type TrainJobSpec struct {
Annotations map[string]string `json:"annotations,omitempty"`

// Custom overrides for the training runtime.
PodSpecOverrides []PodSpecOverrides `json:"podSpecOverrides,omitempty"`
PodSpecOverrides []PodSpecOverride `json:"podSpecOverrides,omitempty"`

// Whether the controller should suspend the running TrainJob.
// Defaults to false.
Suspend *bool `json:"suspend,omitempty"`

// ManagedBy is used to indicate the controller or entity that manages a TrainJob.
// The value must be either an empty, 'kubeflow.org/trainjob-controller' or
// 'kueue.x-k8s.io/multikueue'.
// The built-in TrainJob controller reconciles TrainJob which don't have this
// field at all or the field value is the reserved string
// 'kubeflow.org/trainjob-controller', but delegates reconciling TrainJobs
// with a 'kueue.x-k8s.io/multikueue' to the Kueue.
// The field is immutable.
// The value must be either empty, `kubeflow.org/trainjob-controller` or
// `kueue.x-k8s.io/multikueue`. The built-in TrainJob controller reconciles TrainJobs which
// don't have this field at all or whose field value is the reserved string
// `kubeflow.org/trainjob-controller`, but delegates reconciling TrainJobs
// with `kueue.x-k8s.io/multikueue` to Kueue. The field is immutable.
// Defaults to `kubeflow.org/trainjob-controller`.
ManagedBy *string `json:"managedBy,omitempty"`
}

Expand Down Expand Up @@ -820,9 +819,9 @@ spec:
claimName: model-exporter
```

### The PodSpecOverrides APIs
### The PodSpecOverride APIs

The `PodSpecOverrides` represents overrides for the `TrainingRuntime` when `TrainJob` is created.
The `PodSpecOverride` represents overrides for the `TrainingRuntime` when `TrainJob` is created.
These parameters can include the user's identity or PVC.

Usually, these parameters should not be configured by the user and should be attached during the
Expand All @@ -831,15 +830,15 @@ orchestration (e.g. using Kubernetes admission webhooks or custom clients).
In the future, we can add more parameters if we find use-cases when it is required.

```golang
type PodSpecOverrides struct {
type PodSpecOverride struct {
// Names of the training job replicas in the training runtime template to apply the overrides.
TargetReplicatedJobs []string `json:"targetReplicatedJobs"`

// Overrides for the containers in the desired job templates.
Containers []ContainerOverrides `json:"containers,omitempty"`
Containers []ContainerOverride `json:"containers,omitempty"`

// Overrides for the init containers in the desired job templates.
InitContainers []ContainerOverrides `json:"initContainers,omitempty"`
InitContainers []ContainerOverride `json:"initContainers,omitempty"`

// Overrides for the Pod volume configuration.
Volumes []corev1.Volume `json:"volumes,omitempty"`
Expand All @@ -854,9 +853,9 @@ type PodSpecOverrides struct {
Tolerations []corev1.Toleration `json:"tolerations,omitempty"`
}

// ContainerOverrides represents parameters that can be overridden using PodSpecOverrides.
// ContainerOverride represents parameters that can be overridden using PodSpecOverride.
// Parameters from the Trainer, DatasetConfig, and ModelConfig will take precedence.
type ContainerOverrides struct {
type ContainerOverride struct {
// Name for the container. TrainingRuntime must have this container.
Name string `json:"name"`

Expand Down
20 changes: 13 additions & 7 deletions pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,13 +77,19 @@ type TrainJobSpec struct {
Annotations map[string]string `json:"annotations,omitempty"`

// Custom overrides for the training runtime.
PodSpecOverrides []PodSpecOverrides `json:"podSpecOverrides,omitempty"`
PodSpecOverrides []PodSpecOverride `json:"podSpecOverrides,omitempty"`

// Whether the controller should suspend the running TrainJob.
// Defaults to false.
Suspend *bool `json:"suspend,omitempty"`

// ManagedBy field indicates the controller that manages a TrainJob.
// ManagedBy is used to indicate the controller or entity that manages a TrainJob.
// The value must be either empty, `kubeflow.org/trainjob-controller` or
// `kueue.x-k8s.io/multikueue`. The built-in TrainJob controller reconciles TrainJobs which
// don't have this field at all or whose field value is the reserved string
// `kubeflow.org/trainjob-controller`, but delegates reconciling TrainJobs
// with `kueue.x-k8s.io/multikueue` to Kueue. The field is immutable.
// Defaults to `kubeflow.org/trainjob-controller`.
ManagedBy *string `json:"managedBy,omitempty"`
}

Expand Down Expand Up @@ -187,16 +193,16 @@ type OutputModel struct {
SecretRef *corev1.SecretReference `json:"secretRef,omitempty"`
}

// PodSpecOverrides represents the custom overrides that will be applied for the TrainJob's resources.
type PodSpecOverrides struct {
// PodSpecOverride represents the custom overrides that will be applied for the TrainJob's resources.
type PodSpecOverride struct {
// Names of the training job replicas in the training runtime template to apply the overrides.
TargetReplicatedJobs []string `json:"targetReplicatedJobs"`

// Overrides for the containers in the desired job templates.
Containers []ContainerOverrides `json:"containers,omitempty"`
Containers []ContainerOverride `json:"containers,omitempty"`

// Overrides for the init containers in the desired job templates.
InitContainers []ContainerOverrides `json:"initContainers,omitempty"`
InitContainers []ContainerOverride `json:"initContainers,omitempty"`

// Overrides for the Pod volume configuration.
Volumes []corev1.Volume `json:"volumes,omitempty"`
Expand All @@ -213,7 +219,7 @@ type PodSpecOverrides struct {

// ContainerOverride represents parameters that can be overridden using PodSpecOverride.
// Parameters from the Trainer, DatasetConfig, and ModelConfig will take precedence.
type ContainerOverrides struct {
type ContainerOverride struct {
// Name for the container. TrainingRuntime must have this container.
Name string `json:"name"`

Expand Down
22 changes: 11 additions & 11 deletions pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 273af3c

Please sign in to comment.