From 6d5464229f48569d826089017491fddfce2549e5 Mon Sep 17 00:00:00 2001 From: tenzen-y Date: Sat, 16 Jul 2022 20:06:37 +0900 Subject: [PATCH] add the pytorch-mnist with GPU support container image --- .github/workflows/publish-trial-images.yaml | 6 ++++-- .github/workflows/pytorch-mnist-e2e-test.yaml | 2 +- docs/images-location.md | 17 ++++++++++++++--- .../median-stop-with-json-format.yaml | 2 +- .../pytorchjob-mnist.yaml | 4 ++-- .../custom-metrics-collector.yaml | 2 +- ...file-metrics-collector-with-json-format.yaml | 2 +- .../file-metrics-collector.yaml | 2 +- .../{Dockerfile => Dockerfile.cpu} | 0 .../trial-images/pytorch-mnist/Dockerfile.gpu | 15 +++++++++++++++ .../components/controller/trial-templates.yaml | 4 ++-- scripts/v1beta1/build.sh | 7 +++++-- scripts/v1beta1/push.sh | 7 +++++-- scripts/v1beta1/update-images.sh | 6 ++++-- .../v1beta1/scripts/gh-actions/build-load.sh | 10 +++++----- 15 files changed, 61 insertions(+), 25 deletions(-) rename examples/v1beta1/trial-images/pytorch-mnist/{Dockerfile => Dockerfile.cpu} (100%) create mode 100644 examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.gpu diff --git a/.github/workflows/publish-trial-images.yaml b/.github/workflows/publish-trial-images.yaml index 7d3119b9fa9..949afbe4229 100644 --- a/.github/workflows/publish-trial-images.yaml +++ b/.github/workflows/publish-trial-images.yaml @@ -31,8 +31,10 @@ jobs: include: - trial-name: mxnet-mnist dockerfile: examples/v1beta1/trial-images/mxnet-mnist/Dockerfile - - trial-name: pytorch-mnist - dockerfile: examples/v1beta1/trial-images/pytorch-mnist/Dockerfile + - trial-name: pytorch-mnist-cpu + dockerfile: examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.cpu + - trial-name: pytorch-mnist-gpu + dockerfile: examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.gpu - trial-name: tf-mnist-with-summaries dockerfile: examples/v1beta1/trial-images/tf-mnist-with-summaries/Dockerfile - trial-name: enas-cnn-cifar10-gpu diff --git a/.github/workflows/pytorch-mnist-e2e-test.yaml b/.github/workflows/pytorch-mnist-e2e-test.yaml index 62fa9e73e81..65a1a38d54c 100644 --- a/.github/workflows/pytorch-mnist-e2e-test.yaml +++ b/.github/workflows/pytorch-mnist-e2e-test.yaml @@ -24,7 +24,7 @@ jobs: experiments: ${{ matrix.experiments }} training-operator: true # Comma Delimited - trial-images: pytorch-mnist + trial-images: pytorch-mnist-cpu strategy: fail-fast: false diff --git a/docs/images-location.md b/docs/images-location.md index e390fb4ebdb..ed01bb15016 100644 --- a/docs/images-location.md +++ b/docs/images-location.md @@ -273,13 +273,24 @@ The following table shows images for training containers which are used in the - docker.io/kubeflowkatib/pytorch-mnist + docker.io/kubeflowkatib/pytorch-mnist-cpu - PyTorch MNIST example with printing metrics to the file or StdOut + PyTorch MNIST example with printing metrics to the file or StdOut with CPU support - Dockerfile + Dockerfile + + + + + docker.io/kubeflowkatib/pytorch-mnist-gpu + + + PyTorch MNIST example with printing metrics to the file or StdOut with GPU support + + + Dockerfile diff --git a/examples/v1beta1/early-stopping/median-stop-with-json-format.yaml b/examples/v1beta1/early-stopping/median-stop-with-json-format.yaml index 54e551f672a..6ff73d725a1 100644 --- a/examples/v1beta1/early-stopping/median-stop-with-json-format.yaml +++ b/examples/v1beta1/early-stopping/median-stop-with-json-format.yaml @@ -62,7 +62,7 @@ spec: spec: containers: - name: training-container - image: docker.io/kubeflowkatib/pytorch-mnist:latest + image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest command: - "python3" - "/opt/pytorch-mnist/mnist.py" diff --git a/examples/v1beta1/kubeflow-training-operator/pytorchjob-mnist.yaml b/examples/v1beta1/kubeflow-training-operator/pytorchjob-mnist.yaml index 6ee52a01489..7c79a6ad5b6 100644 --- a/examples/v1beta1/kubeflow-training-operator/pytorchjob-mnist.yaml +++ b/examples/v1beta1/kubeflow-training-operator/pytorchjob-mnist.yaml @@ -46,7 +46,7 @@ spec: spec: containers: - name: pytorch - image: docker.io/kubeflowkatib/pytorch-mnist:latest + image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest command: - "python3" - "/opt/pytorch-mnist/mnist.py" @@ -61,7 +61,7 @@ spec: spec: containers: - name: pytorch - image: docker.io/kubeflowkatib/pytorch-mnist:latest + image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest command: - "python3" - "/opt/pytorch-mnist/mnist.py" diff --git a/examples/v1beta1/metrics-collector/custom-metrics-collector.yaml b/examples/v1beta1/metrics-collector/custom-metrics-collector.yaml index 20d7918104f..0f38b9bb095 100644 --- a/examples/v1beta1/metrics-collector/custom-metrics-collector.yaml +++ b/examples/v1beta1/metrics-collector/custom-metrics-collector.yaml @@ -67,7 +67,7 @@ spec: spec: containers: - name: training-container - image: docker.io/kubeflowkatib/pytorch-mnist:latest + image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest command: - "python3" - "/opt/pytorch-mnist/mnist.py" diff --git a/examples/v1beta1/metrics-collector/file-metrics-collector-with-json-format.yaml b/examples/v1beta1/metrics-collector/file-metrics-collector-with-json-format.yaml index 070a33b6700..48c98759722 100644 --- a/examples/v1beta1/metrics-collector/file-metrics-collector-with-json-format.yaml +++ b/examples/v1beta1/metrics-collector/file-metrics-collector-with-json-format.yaml @@ -52,7 +52,7 @@ spec: spec: containers: - name: training-container - image: docker.io/kubeflowkatib/pytorch-mnist:latest + image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest command: - "python3" - "/opt/pytorch-mnist/mnist.py" diff --git a/examples/v1beta1/metrics-collector/file-metrics-collector.yaml b/examples/v1beta1/metrics-collector/file-metrics-collector.yaml index 2b74fd01934..367b1e35827 100644 --- a/examples/v1beta1/metrics-collector/file-metrics-collector.yaml +++ b/examples/v1beta1/metrics-collector/file-metrics-collector.yaml @@ -54,7 +54,7 @@ spec: spec: containers: - name: training-container - image: docker.io/kubeflowkatib/pytorch-mnist:latest + image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest command: - "python3" - "/opt/pytorch-mnist/mnist.py" diff --git a/examples/v1beta1/trial-images/pytorch-mnist/Dockerfile b/examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.cpu similarity index 100% rename from examples/v1beta1/trial-images/pytorch-mnist/Dockerfile rename to examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.cpu diff --git a/examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.gpu b/examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.gpu new file mode 100644 index 00000000000..cdb6190f247 --- /dev/null +++ b/examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.gpu @@ -0,0 +1,15 @@ +FROM pytorch/pytorch:1.11.0-cuda11.3-cudnn8-runtime + +ADD examples/v1beta1/trial-images/pytorch-mnist /opt/pytorch-mnist +WORKDIR /opt/pytorch-mnist + +# Add folder for the logs. +RUN mkdir /katib +RUN pip install --no-cache-dir -r requirements.txt + +RUN chgrp -R 0 /opt/pytorch-mnist \ + && chmod -R g+rwX /opt/pytorch-mnist \ + && chgrp -R 0 /katib \ + && chmod -R g+rwX /katib + +ENTRYPOINT ["python3", "/opt/pytorch-mnist/mnist.py"] diff --git a/manifests/v1beta1/components/controller/trial-templates.yaml b/manifests/v1beta1/components/controller/trial-templates.yaml index de50cc321bc..916dd3c85b6 100644 --- a/manifests/v1beta1/components/controller/trial-templates.yaml +++ b/manifests/v1beta1/components/controller/trial-templates.yaml @@ -54,7 +54,7 @@ data: spec: containers: - name: pytorch - image: docker.io/kubeflowkatib/pytorch-mnist:latest + image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest command: - "python3" - "/opt/pytorch-mnist/mnist.py" @@ -68,7 +68,7 @@ data: spec: containers: - name: pytorch - image: docker.io/kubeflowkatib/pytorch-mnist:latest + image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest command: - "python3" - "/opt/pytorch-mnist/mnist.py" diff --git a/scripts/v1beta1/build.sh b/scripts/v1beta1/build.sh index 3c4a7020f58..da6f9723731 100755 --- a/scripts/v1beta1/build.sh +++ b/scripts/v1beta1/build.sh @@ -123,8 +123,11 @@ else echo -e "\nBuilding mxnet mnist training container example...\n" docker build --platform linux/amd64 -t "${REGISTRY}/mxnet-mnist:${TAG}" -f examples/${VERSION}/trial-images/mxnet-mnist/Dockerfile . - echo -e "\nBuilding PyTorch mnist training container example...\n" - docker build --platform linux/amd64 -t "${REGISTRY}/pytorch-mnist:${TAG}" -f examples/${VERSION}/trial-images/pytorch-mnist/Dockerfile . + echo -e "\nBuilding PyTorch mnist training container example with CPU support...\n" + docker build --platform linux/amd64 -t "${REGISTRY}/pytorch-mnist-cpu:${TAG}" -f examples/${VERSION}/trial-images/pytorch-mnist/Dockerfile.cpu . + + echo -e "\nBuilding PyTorch mnist training container example with GPU support...\n" + docker build --platform linux/amd64 -t "${REGISTRY}/pytorch-mnist-gpu:${TAG}" -f examples/${VERSION}/trial-images/pytorch-mnist/Dockerfile.gpu . echo -e "\nBuilding Keras CIFAR-10 CNN training container example for ENAS with GPU support...\n" docker build --platform linux/amd64 -t "${REGISTRY}/enas-cnn-cifar10-gpu:${TAG}" -f examples/${VERSION}/trial-images/enas-cnn-cifar10/Dockerfile.gpu . diff --git a/scripts/v1beta1/push.sh b/scripts/v1beta1/push.sh index 1d098815933..95f5d3c98eb 100755 --- a/scripts/v1beta1/push.sh +++ b/scripts/v1beta1/push.sh @@ -98,8 +98,11 @@ docker push "${REGISTRY}/mxnet-mnist:${TAG}" echo -e "\nPushing Tensorflow with summaries mnist training container example...\n" docker push "${REGISTRY}/tf-mnist-with-summaries:${TAG}" -echo -e "\nPushing PyTorch mnist training container example...\n" -docker push "${REGISTRY}/pytorch-mnist:${TAG}" +echo -e "\nPushing PyTorch mnist training container example with CPU support...\n" +docker push "${REGISTRY}/pytorch-mnist-cpu:${TAG}" + +echo -e "\nPushing PyTorch mnist training container example with GPU support...\n" +docker push "${REGISTRY}/pytorch-mnist-gpu:${TAG}" echo -e "\nPushing Keras CIFAR-10 CNN training container example for ENAS with GPU support...\n" docker push "${REGISTRY}/enas-cnn-cifar10-gpu:${TAG}" diff --git a/scripts/v1beta1/update-images.sh b/scripts/v1beta1/update-images.sh index d532d150dda..b35892ebc5b 100755 --- a/scripts/v1beta1/update-images.sh +++ b/scripts/v1beta1/update-images.sh @@ -83,7 +83,8 @@ update_yaml_files "${CONFIG_PATH}" ":[^[:space:]].*\"" ":${TAG}\"" # Postfixes for the each Trial image. MXNET_MNIST="mxnet-mnist" -PYTORCH_MNIST="pytorch-mnist" +PYTORCH_MNIST_CPU="pytorch-mnist-cpu" +PYTORCH_MNIST_GPU="pytorch-mnist-gpu" TF_MNIST_WITH_SUMMARIES="tf-mnist-with-summaries" ENAS_GPU="enas-cnn-cifar10-gpu" ENAS_CPU="enas-cnn-cifar10-cpu" @@ -93,7 +94,8 @@ SIMPLE_PBT="simple-pbt" echo -e "Update Katib Trial training container images\n" update_yaml_files "./" "${OLD_PREFIX}${MXNET_MNIST}:.*" "${NEW_PREFIX}${MXNET_MNIST}:${TAG}" -update_yaml_files "./" "${OLD_PREFIX}${PYTORCH_MNIST}:.*" "${NEW_PREFIX}${PYTORCH_MNIST}:${TAG}" +update_yaml_files "./" "${OLD_PREFIX}${PYTORCH_MNIST_CPU}:.*" "${NEW_PREFIX}${PYTORCH_MNIST_CPU}:${TAG}" +update_yaml_files "./" "${OLD_PREFIX}${PYTORCH_MNIST_GPU}:.*" "${NEW_PREFIX}${PYTORCH_MNIST_GPU}:${TAG}" update_yaml_files "./" "${OLD_PREFIX}${TF_MNIST_WITH_SUMMARIES}:.*" "${NEW_PREFIX}${TF_MNIST_WITH_SUMMARIES}:${TAG}" update_yaml_files "./" "${OLD_PREFIX}${ENAS_GPU}:.*" "${NEW_PREFIX}${ENAS_GPU}:${TAG}" update_yaml_files "./" "${OLD_PREFIX}${ENAS_CPU}:.*" "${NEW_PREFIX}${ENAS_CPU}:${TAG}" diff --git a/test/e2e/v1beta1/scripts/gh-actions/build-load.sh b/test/e2e/v1beta1/scripts/gh-actions/build-load.sh index a16b4407db3..30fae899236 100755 --- a/test/e2e/v1beta1/scripts/gh-actions/build-load.sh +++ b/test/e2e/v1beta1/scripts/gh-actions/build-load.sh @@ -30,7 +30,7 @@ REGISTRY="docker.io/kubeflowkatib" TAG="e2e-test" VERSION="v1beta1" CMD_PREFIX="cmd" -SPECIFIED_DEVICE_TYPE_IMAGES=("enas-cnn-cifar10-cpu" "darts-cnn-cifar10-cpu") +SPECIFIED_DEVICE_TYPE_IMAGES=("enas-cnn-cifar10-cpu" "darts-cnn-cifar10-cpu" "pytorch-mnist-cpu") IFS="," read -r -a TRIAL_IMAGE_ARRAY <<< "$TRIAL_IMAGES" IFS="," read -r -a EXPERIMENT_ARRAY <<< "$EXPERIMENTS" @@ -51,7 +51,7 @@ _build_containers() { docker build --platform "$(uname -m)" -t "$REGISTRY/$CONTAINER_NAME:$TAG" -f "../../../../../$DOCKERFILE" ../../../../../ } -_load_kind_cluster() { +_load_minikube_cluster() { CONTAINER_NAME=${1:-"katib-controller"} echo -e "\n\nLoading $CONTAINER_NAME image...\n\n" @@ -99,7 +99,7 @@ run() { for s in "${suggestions[@]}"; do if [ "$s" == "$CONTAINER_NAME" ]; then _build_containers "$CONTAINER_NAME" "$DOCKERFILE" - _load_kind_cluster "$CONTAINER_NAME" + _load_minikube_cluster "$CONTAINER_NAME" break fi done @@ -126,7 +126,7 @@ run() { for e in "${earlystoppings[@]}"; do if [ "$e" == "$CONTAINER_NAME" ]; then _build_containers "$CONTAINER_NAME" "$DOCKERFILE" - _load_kind_cluster "$CONTAINER_NAME" + _load_minikube_cluster "$CONTAINER_NAME" break fi done @@ -134,7 +134,7 @@ run() { # Others else _build_containers "$CONTAINER_NAME" "$DOCKERFILE" - _load_kind_cluster "$CONTAINER_NAME" + _load_minikube_cluster "$CONTAINER_NAME" fi }