Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/master'
Browse files Browse the repository at this point in the history
  • Loading branch information
johnugeorge committed Apr 28, 2024
2 parents e8dea4c + f8f7363 commit 3b64a8c
Show file tree
Hide file tree
Showing 128 changed files with 23,220 additions and 24,073 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/build-and-publish-images.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ jobs:
- name: Checkout
uses: actions/checkout@v3

- name: Free-Up Disk Space
uses: ./.github/workflows/free-up-disk-space

- name: Docker Login
# Trigger workflow only for kubeflow/training-operator repository with specific branch (master, v.*-branch) or tag (v.*).
if: >-
Expand Down
49 changes: 49 additions & 0 deletions .github/workflows/free-up-disk-space/action.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
name: Free-Up Disk Space
description: Remove Non-Essential Tools And Move Docker Data Directory to /mnt/docker

runs:
using: composite
steps:
# This step is a Workaround to avoid the "No space left on device" error.
# ref: https://github.com/actions/runner-images/issues/2840
- name: Remove unnecessary files
shell: bash
run: |
echo "Disk usage before cleanup:"
df -hT
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf /usr/local/share/boost
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
sudo rm -rf /usr/local/lib/android
sudo rm -rf /usr/local/share/powershell
sudo rm -rf /usr/share/swift
echo "Disk usage after cleanup:"
df -hT
- name: Prune docker images
shell: bash
run: |
docker image prune -a -f
docker system df
df -hT
- name: Move docker data directory
shell: bash
run: |
echo "Stopping docker service ..."
sudo systemctl stop docker
DOCKER_DEFAULT_ROOT_DIR=/var/lib/docker
DOCKER_ROOT_DIR=/mnt/docker
echo "Moving ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
sudo mv ${DOCKER_DEFAULT_ROOT_DIR} ${DOCKER_ROOT_DIR}
echo "Creating symlink ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
sudo ln -s ${DOCKER_ROOT_DIR} ${DOCKER_DEFAULT_ROOT_DIR}
echo "$(sudo ls -l ${DOCKER_DEFAULT_ROOT_DIR})"
echo "Starting docker service ..."
sudo systemctl daemon-reload
sudo systemctl start docker
echo "Docker service status:"
sudo systemctl --no-pager -l -o short status docker
39 changes: 13 additions & 26 deletions .github/workflows/integration-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
# strategy:
# fail-fast: false
# matrix:
# kubernetes-version: [""v1.25.11", "v1.26.6", "v1.27.3"]
# kubernetes-version: ["v1.27.11", "v1.28.7", "v1.29.2"]
# gang-scheduler-name: ["none", "scheduler-plugins", "volcano"]
# ```
# The difference is that each combination is randomly assigned various Python versions
Expand All @@ -26,54 +26,41 @@ jobs:
matrix:
# TODO (tenzen-y): Add volcano.
include:
- kubernetes-version: v1.26.6
- kubernetes-version: v1.29.2
gang-scheduler-name: "none"
python-version: "3.10"
- kubernetes-version: v1.27.3
- kubernetes-version: v1.27.11
gang-scheduler-name: "none"
python-version: "3.7"
- kubernetes-version: v1.25.11
- kubernetes-version: v1.28.7
gang-scheduler-name: "none"
python-version: "3.8"
- kubernetes-version: v1.26.6
- kubernetes-version: v1.29.2
gang-scheduler-name: "scheduler-plugins"
python-version: "3.9"
- kubernetes-version: v1.27.3
- kubernetes-version: v1.27.11
gang-scheduler-name: "scheduler-plugins"
python-version: "3.10"
- kubernetes-version: v1.25.11
- kubernetes-version: v1.28.7
gang-scheduler-name: "scheduler-plugins"
python-version: "3.10"
- kubernetes-version: v1.26.6
- kubernetes-version: v1.29.2
gang-scheduler-name: "volcano"
python-version: "3.9"
- kubernetes-version: v1.27.3
- kubernetes-version: v1.27.11
gang-scheduler-name: "volcano"
python-version: "3.10"
- kubernetes-version: v1.25.11
- kubernetes-version: v1.28.7
gang-scheduler-name: "volcano"
python-version: "3.10"

steps:
# This step is a Workaround to avoid the "No space left on device" error.
# ref: https://github.com/actions/runner-images/issues/2840
- name: Remove unnecessary files
shell: bash
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
sudo rm -rf /usr/local/lib/android
sudo rm -rf /usr/local/share/powershell
sudo rm -rf /usr/share/swift
echo "Disk usage after cleanup:"
df -h
- name: Checkout
uses: actions/checkout@v3

- name: Free-Up Disk Space
uses: ./.github/workflows/free-up-disk-space

- name: Setup Python
uses: actions/setup-python@v4
with:
Expand Down
11 changes: 6 additions & 5 deletions .github/workflows/publish-example-images.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,9 @@ jobs:
- component-name: mxnet-auto-tuning
dockerfile: examples/mxnet/tune/Dockerfile
context: examples/mxnet/tune
# TODO (tenzen-y): Fix the below broken Dockerfiles
# - component-name: pytorch-dist-mnist-mpi
# dockerfile: examples/pytorch/mnist/Dockerfile-mpi
# - component-name: pytorch-dist-mnist
# dockerfile: examples/pytorch/mnist/Dockerfile
- component-name: pytorch-dist-mnist
dockerfile: examples/pytorch/mnist/Dockerfile
context: examples/pytorch/mnist
- component-name: pytorch-dist-mnist-mpi
dockerfile: examples/pytorch/mnist/Dockerfile-mpi
context: examples/pytorch/mnist
46 changes: 1 addition & 45 deletions .github/workflows/template-publish-image/action.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,50 +23,6 @@ inputs:
runs:
using: composite
steps:
# This step is a Workaround to avoid the "No space left on device" error.
# ref: https://github.com/actions/runner-images/issues/2840
- name: Remove unnecessary files
shell: bash
run: |
echo "Disk usage before cleanup:"
df -hT
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf /usr/local/share/boost
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
sudo rm -rf /usr/local/lib/android
sudo rm -rf /usr/local/share/powershell
sudo rm -rf /usr/share/swift
echo "Disk usage after cleanup:"
df -hT
- name: Prune docker images
shell: bash
run: |
docker image prune -a -f
docker system df
df -hT
- name: Move docker data directory
shell: bash
run: |
echo "Stopping docker service ..."
sudo systemctl stop docker
DOCKER_DEFAULT_ROOT_DIR=/var/lib/docker
DOCKER_ROOT_DIR=/mnt/docker
echo "Moving ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
sudo mv ${DOCKER_DEFAULT_ROOT_DIR} ${DOCKER_ROOT_DIR}
echo "Creating symlink ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
sudo ln -s ${DOCKER_ROOT_DIR} ${DOCKER_DEFAULT_ROOT_DIR}
echo "$(sudo ls -l ${DOCKER_DEFAULT_ROOT_DIR})"
echo "Starting docker service ..."
sudo systemctl daemon-reload
sudo systemctl start docker
echo "Docker service status:"
sudo systemctl --no-pager -l -o short status docker
- name: Setup QEMU
uses: docker/setup-qemu-action@v2
with:
Expand All @@ -93,4 +49,4 @@ runs:
push: ${{ inputs.push }}
tags: ${{ steps.meta.outputs.tags }}
cache-from: type=gha
cache-to: type=gha,mode=max
cache-to: type=gha
2 changes: 1 addition & 1 deletion .github/workflows/test-python.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
- name: Check Python code with Black
uses: psf/black@stable
with:
version: 23.9.1
version: 24.2.0
options: --check --exclude '/*kubeflow_org_v1*|__init__.py|api_client.py|configuration.py|exceptions.py|rest.py'
src: sdk/

Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/unittests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ jobs:
strategy:
fail-fast: false
matrix:
kubernetes-version: ["1.25.0", "1.26.1", "1.27.1"]
# Detail: `setup-envtest list`
kubernetes-version: ["1.27.1", "1.28.3", "1.29.1"]

steps:
- name: Check out code
Expand Down
15 changes: 9 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,10 @@ help: ## Display this help.
##@ Development

manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects.
$(CONTROLLER_GEN) $(CRD_OPTIONS) rbac:roleName=training-operator webhook paths="./pkg/..." output:crd:artifacts:config=manifests/base/crds output:rbac:artifacts:config=manifests/base/rbac
$(CONTROLLER_GEN) $(CRD_OPTIONS) rbac:roleName=training-operator webhook paths="./pkg/..." \
output:crd:artifacts:config=manifests/base/crds \
output:rbac:artifacts:config=manifests/base/rbac \
output:webhook:artifacts:config=manifests/base/webhook

generate: controller-gen ## Generate apidoc, sdk and code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations.
$(CONTROLLER_GEN) object:headerFile="hack/boilerplate/boilerplate.go.txt" paths="./pkg/apis/..."
Expand All @@ -57,12 +60,12 @@ vet: ## Run go vet against code.
GOLANGCI_LINT=$(shell which golangci-lint)
golangci-lint:
ifeq ($(GOLANGCI_LINT),)
curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(shell go env GOPATH)/bin v1.53.3
curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(shell go env GOPATH)/bin v1.57.2
$(info golangci-lint has been installed)
endif
golangci-lint run --timeout 5m --go 1.20 ./...
golangci-lint run --timeout 5m --go 1.22 ./...

ENVTEST_K8S_VERSION ?= 1.27
ENVTEST_K8S_VERSION ?= 1.29
HAS_SETUP_ENVTEST := $(shell command -v setup-envtest;)

testall: manifests generate fmt vet golangci-lint test ## Run tests.
Expand All @@ -72,7 +75,7 @@ test: envtest

envtest:
ifndef HAS_SETUP_ENVTEST
go install sigs.k8s.io/controller-runtime/tools/setup-envtest@116a1b831fffe7ccc3c8145306c3e1a3b1b14ffa # v0.15.0
go install sigs.k8s.io/controller-runtime/tools/setup-envtest@bf15e44028f908c790721fc8fe67c7bf2d06a611 # v0.17.2
@echo "setup-envtest has been installed"
endif
@echo "setup-envtest has already installed"
Expand Down Expand Up @@ -108,7 +111,7 @@ PROJECT_DIR := $(shell dirname $(abspath $(lastword $(MAKEFILE_LIST))))

CONTROLLER_GEN = $(shell pwd)/bin/controller-gen
controller-gen: ## Download controller-gen locally if necessary.
GOBIN=$(PROJECT_DIR)/bin go install sigs.k8s.io/controller-tools/cmd/controller-gen@v0.13.0
GOBIN=$(PROJECT_DIR)/bin go install sigs.k8s.io/controller-tools/cmd/controller-gen@v0.14.0

KUSTOMIZE = $(shell pwd)/bin/kustomize
kustomize: ## Download kustomize locally if necessary.
Expand Down
2 changes: 1 addition & 1 deletion build/images/kubectl-delivery/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
FROM alpine:3.17 AS build

# Install kubectl.
ENV K8S_VERSION v1.27.5
ENV K8S_VERSION v1.29.3

RUN apk add --no-cache wget
RUN wget -q https://storage.googleapis.com/kubernetes-release/release/${K8S_VERSION}/bin/linux/amd64/kubectl
Expand Down
2 changes: 1 addition & 1 deletion build/images/training-operator/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Build the manager binary
FROM golang:1.20 as builder
FROM golang:1.22 as builder

WORKDIR /workspace
# Copy the Go Modules manifests
Expand Down
Loading

0 comments on commit 3b64a8c

Please sign in to comment.