Skip to content
This repository has been archived by the owner on May 25, 2023. It is now read-only.

MPI Job support for Kube-Batch #688

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -123,3 +123,5 @@ kubernetes.tar.gz
/bazel-*
*.pyc

# Ignore the downloaded dind-cluster script
hack/dind-cluster-*
6 changes: 3 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@ verify: generate-code
init:
mkdir -p ${BIN_DIR}

generate-code: init
go build -o ${BIN_DIR}/deepcopy-gen ./cmd/deepcopy-gen/
${BIN_DIR}/deepcopy-gen -i ./pkg/apis/scheduling/v1alpha1/ -O zz_generated.deepcopy
generate-code:
go get -u k8s.io/code-generator/...
hack/update-codegen.sh

rel_bins:
go get github.com/mitchellh/gox
Expand Down
71 changes: 71 additions & 0 deletions config/crds/scheduling_v1alpha1_mpi.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
apiVersion: apiextensions.k8s.io/v1beta1
kind: CustomResourceDefinition
metadata:
creationTimestamp: null
labels:
controller-tools.k8s.io: "1.0"
name: mpis.scheduling.incubator.k8s.io
spec:
group: scheduling.incubator.k8s.io
names:
kind: MPI
plural: mpis
scope: Namespaced
validation:
openAPIV3Schema:
properties:
apiVersion:
description: 'APIVersion defines the versioned schema of this representation
of an object. Servers should convert recognized schemas to the latest
internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources'
type: string
kind:
description: 'Kind is a string value representing the REST resource this
object represents. Servers may infer this from the endpoint the client
submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds'
type: string
metadata:
type: object
spec:
properties:
job:
description: 'The batch/v1 JobSpec of the Job created for this MPI object.'
type: object
required:
- job
type: object
status:
properties:
descriptors:
description: additional job descriptors
type: object
job:
description: the job status
type: object
podGroup:
description: the podGroup status
properties:
failed:
description: The number of pods which reached phase Failed.
format: int32
type: integer
running:
description: The number of actively running pods.
format: int32
type: integer
succeeded:
description: The number of pods which reached phase Succeeded.
format: int32
type: integer
type: object
type: object
version: v1alpha1
status:
acceptedNames:
kind: ""
plural: ""
conditions: []
storedVersions: []
71 changes: 71 additions & 0 deletions deployment/kube-batch/templates/scheduling_v1alpha1_mpi.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
apiVersion: apiextensions.k8s.io/v1beta1
kind: CustomResourceDefinition
metadata:
creationTimestamp: null
labels:
controller-tools.k8s.io: "1.0"
name: mpis.scheduling.incubator.k8s.io
spec:
group: scheduling.incubator.k8s.io
names:
kind: MPI
plural: mpis
scope: Namespaced
validation:
openAPIV3Schema:
properties:
apiVersion:
description: 'APIVersion defines the versioned schema of this representation
of an object. Servers should convert recognized schemas to the latest
internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources'
type: string
kind:
description: 'Kind is a string value representing the REST resource this
object represents. Servers may infer this from the endpoint the client
submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds'
type: string
metadata:
type: object
spec:
properties:
job:
description: 'The batch/v1 JobSpec of the Job created for this MPI object.'
type: object
required:
- job
type: object
status:
properties:
descriptors:
description: additional job descriptors
type: object
job:
description: the job status
type: object
podGroup:
description: the podGroup status
properties:
failed:
description: The number of pods which reached phase Failed.
format: int32
type: integer
running:
description: The number of actively running pods.
format: int32
type: integer
succeeded:
description: The number of pods which reached phase Succeeded.
format: int32
type: integer
type: object
type: object
version: v1alpha1
status:
acceptedNames:
kind: ""
plural: ""
conditions: []
storedVersions: []
58 changes: 58 additions & 0 deletions doc/design/mpi.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
## MPI Jobs in Kube-Batch

jeefy marked this conversation as resolved.
Show resolved Hide resolved
Kube-Batch has support for an MPI-style job. This combines PodGroups with additional Kubernetes primitives
to spin up multiple pods and allow them to communicate with one another via [MPI](https://www.open-mpi.org/).

### How it works

An MPI object is composed of a JobSpec and a PodGroupSpec. When an MPI object is created, it:

1. Creates a ServiceAccount, Role, and RoleBinding. This gives us granular control over what permissions
the pods have, and supports some cool NetworkPolicy features.
2. Creates two blank ConfigMap objects. One contains the MPI hostfile, the other contains data for
/etc/hosts as well as the init scripts.
3. Creates a Job and a PodGroup keyed to the name of the MPI Object.
4. Once all Pods have been created, Kube-Batch will update the ConfigMap objects with the IP addresses of
all running MPI-Job Pods, as well as designate one Pod as the "Executor" -- the Pod that will actually
run the desired command.
5. The Pod designated as the executor will download `kubectl` and use that as the transport channel to
communicate with the other pods in the PodGroup. Once pod connection is established, it will begin
running the MPI job.

### Caveats

Any image used in the JobSpec of the MPI Object must already have MPI support baked in. For our example,
we use `continuse/mpich:v3` as a base MPI image.

MPI jobs take some time to start once scheduled. This is because, at runtime, tools like
`kubectl` must be downloaded and the state of all pods must be reconciled first. There are many
optimizations that should be made over time.

### Example YAML

```yaml
apiVersion: scheduling.incubator.k8s.io/v1alpha1
kind: MPI
metadata:
name: test-mpi
spec:
job:
apiVersion: batch/v1
kind: Job
spec:
jeefy marked this conversation as resolved.
Show resolved Hide resolved
completions: 6
parallelism: 6
template:
spec:
containers:
- image: continuse/mpich:v3
imagePullPolicy: IfNotPresent
name: mpi
command: ["mpirun", "-n", "6", "hostname"]
resources:
requests:
cpu: "200m"
podGroup:
spec:
minMember: 6
```
19 changes: 19 additions & 0 deletions example/mpi.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
apiVersion: scheduling.incubator.k8s.io/v1alpha1
kind: MPI
metadata:
name: test-mpi
namespace: default
spec:
job:
completions: 6
parallelism: 6
template:
spec:
containers:
- image: continuse/mpich:v3
imagePullPolicy: IfNotPresent
name: mpi
command: ["mpirun", "-n", "6", "hostname"]
resources:
requests:
cpu: "200m"
7 changes: 6 additions & 1 deletion hack/run-e2e.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,18 @@ curl ${dind_url} --output ${dind_dest}
chmod +x ${dind_dest}
${dind_dest} up

kubectl create -f config/crds/scheduling_v1alpha1_mpi.yaml
kubectl create -f config/crds/scheduling_v1alpha1_podgroup.yaml
kubectl create -f config/crds/scheduling_v1alpha1_queue.yaml
kubectl create -f config/queue/default.yaml

sleep 5;

# start kube-batch
nohup ${KA_BIN}/kube-batch --kubeconfig ${HOME}/.kube/config --scheduler-conf=config/kube-batch-conf.yaml --logtostderr --v ${LOG_LEVEL} > scheduler.log 2>&1 &

sleep 5;

# clean up
function cleanup {
killall -9 kube-batch
Expand All @@ -35,4 +40,4 @@ function cleanup {
trap cleanup EXIT

# Run e2e test
go test ./test/e2e -v -timeout 30m
go test ./test/e2e -v -timeout 30m
9 changes: 9 additions & 0 deletions hack/update-codegen.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/usr/bin/env bash

# Regenerates the generated code for the scheduling:v1alpha1 API group by
# running k8s.io/code-generator's generate-groups.sh with the "all" generator
# set. Output goes to the kube-batch pkg/client package; the API types are read
# from the kube-batch pkg/apis package.
#
# Requires a GOPATH-style checkout: both k8s.io/code-generator and
# github.com/kubernetes-sigs/kube-batch are addressed under ${GOPATH}/src.

# Abort on the first failing command, on use of an unset variable, and on a
# failure anywhere in a pipeline. Without this, a failed `go get` or `cd` would
# let generate-groups.sh run from the wrong directory.
set -o errexit
set -o nounset
set -o pipefail

# Fall back to the Go toolchain's notion of GOPATH when the environment
# variable is not exported.
GOPATH="${GOPATH:-$(go env GOPATH)}"

go get -u k8s.io/code-generator

# Quote the path so a GOPATH containing spaces does not get word-split.
cd "${GOPATH}/src/k8s.io/code-generator"

./generate-groups.sh all "github.com/kubernetes-sigs/kube-batch/pkg/client" "github.com/kubernetes-sigs/kube-batch/pkg/apis" scheduling:v1alpha1

cd "${GOPATH}/src/github.com/kubernetes-sigs/kube-batch/"
2 changes: 2 additions & 0 deletions pkg/apis/scheduling/v1alpha1/register.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ func Resource(resource string) schema.GroupResource {
// addKnownTypes adds the set of types defined in this package to the supplied scheme.
func addKnownTypes(scheme *runtime.Scheme) error {
scheme.AddKnownTypes(SchemeGroupVersion,
&MPI{},
&MPIList{},
&PodGroup{},
&PodGroupList{},
&Queue{},
Expand Down
45 changes: 44 additions & 1 deletion pkg/apis/scheduling/v1alpha1/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,53 @@ limitations under the License.
package v1alpha1

import (
"k8s.io/api/core/v1"
batchv1 "k8s.io/api/batch/v1"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// MPISpec defines the desired state of MPI.
type MPISpec struct {
// Job is the batch/v1 JobSpec carried by the MPI object. It is serialized
// under the "job" key; the tag has no omitempty, so the key is always
// emitted.
Job batchv1.JobSpec `json:"job"`
}

// MPIStatus defines the observed state of MPI.
type MPIStatus struct {
// Job is the job status.
// +optional
Job batchv1.JobStatus `json:"job,omitempty"`
// PodGroup is the podGroup status.
// +optional
PodGroup PodGroupStatus `json:"podGroup,omitempty"`
// Descriptors holds additional job descriptors as string key/value pairs.
// +optional
Descriptors map[string]string `json:"descriptors,omitempty"`
}

// +genclient
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object

// MPI is the Schema for the mpis API. It embeds the standard type and object
// metadata and pairs a desired state (Spec) with an observed state (Status).
// +k8s:openapi-gen=true
type MPI struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`

// Spec is the desired state of the MPI object.
Spec MPISpec `json:"spec,omitempty"`
// Status is the observed state of the MPI object.
Status MPIStatus `json:"status,omitempty"`
}

// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object

// MPIList contains a list of MPI.
type MPIList struct {
metav1.TypeMeta `json:",inline"`
metav1.ListMeta `json:"metadata,omitempty"`
// Items holds the MPI objects in the list; it is serialized under "items".
Items []MPI `json:"items"`
}

// PodGroupPhase is the phase of a pod group at the current time.
type PodGroupPhase string

Expand Down
Loading