# Skip to content
#
# Workflow dispatch for tpu type. #365
#
# Workflow dispatch for tpu type.
#
# Workflow dispatch for tpu type. #365
#
# Workflow file for this run

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
name: Nightly Tests

# Triggers: every pull request, plus manual dispatch where the caller must
# pick one of the listed TPU types.
on:
  pull_request:
  workflow_dispatch:
    inputs:
      tpuType:
        description: "TPU Type"
        type: choice
        options:
          - "v4-8"
          - "v5litepod-8"
        default: "v4-8"
        required: true
  # The scheduled trigger is currently disabled.
  # schedule: # Schedule the job run at 12AM PST daily.
  #   - cron: '0 8 * * *'
env:
  # TPU type selected at dispatch time; falls back to v4-8 when no input
  # value is present.
  TPU_TYPE: ${{ inputs.tpuType || 'v4-8' }}

  # Base cluster names. Names must be unique in parallel running tests.
  EMPTY_CLUSTER_NAME: nightly-xpk-zero-nodepools
  PRIVATE_CLUSTER_NAME: nightly-xpk-private-2-nodepools
  TPU_CLUSTER_NAME: nightly-xpk-2-nodepools
  PATHWAYS_TPU_CLUSTER_NAME: pw-nightly-test-2-nodepools
  RAYCLUSTER_TPU_CLUSTER_NAME: rc-nightly-test-2-nodepools

  # Workload names, suffixed with the attempt number of the current run.
  WORKLOAD_NAME: xpktest-nightly-${{ github.run_attempt }}
  PATHWAYS_WORKLOAD_NAME: xpkpw-nightly-${{ github.run_attempt }}

  # Network flags assembled from repository secrets and handed to
  # --custom-cluster-arguments.
  CLUSTER_NETWORK_ARGUMENTS: '--network=${{ secrets.NETWORK_NAME }} --subnetwork=${{ secrets.SUBNETWORK_NAME }}'
jobs:
  # End-to-end xpk test. In order, it:
  #   1. creates and deletes a cluster with zero node pools,
  #   2. creates, verifies and deletes a private cluster,
  #   3. creates a 2-slice cluster, runs a workload, a batch job and an
  #      interactive shell on it, then deletes it.
  # NOTE(review): the xpk commands below pass --tpu-type=v4-8 and
  # --zone=us-central2-b literally; TPU_TYPE only changes the cluster names.
  cluster-create-and-delete:
    runs-on: [ubuntu-22.04]
    concurrency:  # We support one build test to run at a time currently.
      group: nightly-test-cluster-group
      cancel-in-progress: false
    steps:
      # Append the TPU type to every cluster name and export the result so
      # all later steps see the suffixed names.
      - name: Set env variables
        run: |
          echo "PRIVATE_CLUSTER_NAME=$PRIVATE_CLUSTER_NAME-$TPU_TYPE" >> "$GITHUB_ENV"
          echo "EMPTY_CLUSTER_NAME=$EMPTY_CLUSTER_NAME-$TPU_TYPE" >> "$GITHUB_ENV"
          echo "TPU_CLUSTER_NAME=$TPU_CLUSTER_NAME-$TPU_TYPE" >> "$GITHUB_ENV"
          echo "PATHWAYS_TPU_CLUSTER_NAME=$PATHWAYS_TPU_CLUSTER_NAME-$TPU_TYPE" >> "$GITHUB_ENV"
          echo "RAYCLUSTER_TPU_CLUSTER_NAME=$RAYCLUSTER_TPU_CLUSTER_NAME-$TPU_TYPE" >> "$GITHUB_ENV"
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.10'
      # expect drives the interactive `xpk shell` test further down.
      # Refresh the package index and install non-interactively.
      - name: Install expect package
        run: sudo apt-get update && sudo apt-get install -y expect
      - uses: 'google-github-actions/auth@v2'
        with:
          credentials_json: '${{ secrets.GCP_SA_KEY }}'
      - uses: google-github-actions/setup-gcloud@v2
        with:
          version: '>= 363.0.0'
          install_components: 'beta,gke-gcloud-auth-plugin'
      - name: Verify gcp setup
        run: gcloud info
      - name: Set Google Cloud CLI properties to an unused zone to verify --zone arg is passed properly in commands.
        run: |
          gcloud config set compute/zone us-east4-a
          gcloud config get compute/zone
      - name: Install xpk dependencies
        run: |
          make install
          echo $PWD/bin >> "$GITHUB_PATH"
      - name: Check xpk installation
        run: xpk --help

      # --- Cluster with zero node pools ---
      - name: Create an XPK Cluster with zero node pools
        run: python xpk.py cluster create --cluster $EMPTY_CLUSTER_NAME --tpu-type=v4-8 --num-slices=0 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments='${{ secrets.CLUSTER_ARGUMENTS }}'
      # always() makes the cleanup run even when an earlier step failed.
      - name: Delete the cluster created
        run: python xpk.py cluster delete --cluster $EMPTY_CLUSTER_NAME --zone=us-central2-b --force
        if: always()

      # --- Private cluster with zero node pools ---
      # CLUSTER_NETWORK_ARGUMENTS is double-quoted so this step's shell expands
      # it; single quotes would pass the literal text ${CLUSTER_NETWORK_ARGUMENTS}.
      - name: Create a Private XPK Cluster with zero node pools
        run: python xpk.py cluster create --cluster $PRIVATE_CLUSTER_NAME --private --tpu-type=v4-8 --num-slices=0 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments="${CLUSTER_NETWORK_ARGUMENTS}"
      - name: Verify the created cluster is private
        run: gcloud container clusters describe $PRIVATE_CLUSTER_NAME --region=us-central2 --format="value(privateClusterConfig.enablePrivateNodes)" | grep 'True' || (echo 'The created cluster is not private.' && exit 1)
      - name: Delete the cluster created
        run: python xpk.py cluster delete --cluster $PRIVATE_CLUSTER_NAME --zone=us-central2-b --force
        if: always()

      # --- Cluster with two v4-8 node pools: workload tests ---
      - name: Create an XPK Cluster with 2x v4-8 nodepools
        run: python xpk.py cluster create --cluster $TPU_CLUSTER_NAME --tpu-type=v4-8 --num-slices=2 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments='${{ secrets.CLUSTER_ARGUMENTS }}'
      - name: Authenticate Docker
        run: gcloud auth configure-docker --quiet
      - name: Create test script to execute in workloads
        run: echo -e '#!/bin/bash \n echo "Hello world from a test script!"' > workload.sh
      - name: Run a base-docker-image workload
        run: python xpk.py workload create --cluster $TPU_CLUSTER_NAME --workload $WORKLOAD_NAME --command "bash workload.sh" --tpu-type=v4-8 --num-slices=2 --zone=us-central2-b
      - name: List out the workloads on the cluster
        run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
      - name: Run xpk inspector with the workload created above
        run: python3 xpk.py inspector --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --workload $WORKLOAD_NAME
      - name: Wait for workload completion and confirm it succeeded
        run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --wait-for-job-completion $WORKLOAD_NAME --timeout 300
      - name: Run xpk info command
        run: python3 xpk.py info --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
      - name: Delete the workload on the cluster
        run: python3 xpk.py workload delete --workload $WORKLOAD_NAME --cluster $TPU_CLUSTER_NAME --zone=us-central2-b

      # --- Batch job tests ---
      # The script carries an unrecognised #SBATCH flag on purpose; the batch
      # command below is invoked with --ignore-unknown-flags.
      - name: Create test script to execute in batch
        run: echo -e '#!/bin/bash \n#SBATCH --unknown-flag=value\n echo "Hello world from a test script!"' > batch.sh
      - name: Run a batch job on the cluster
        run: python3 xpk.py batch --cluster $TPU_CLUSTER_NAME --zone=us-central2-b batch.sh --ignore-unknown-flags --array 1-5 --nodes 2 --ntasks 3
      - name: List out the jobs on the cluster
        run: python3 xpk.py job ls --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep 'xpk-def-app-profile-slurm-'
      # Take the first column of the first matching row and export it for the
      # remaining job steps.
      - name: Get created job name
        run: |
          JOB_NAME=$(python3 xpk.py job ls --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep 'xpk-def-app-profile-slurm-' | head -1 | awk '{print $1}')
          echo "JOB_NAME=${JOB_NAME}" >> "$GITHUB_ENV"
      - name: Check job spec
        run: |
          job_spec=$(kubectl get job "${JOB_NAME}" -o jsonpath='{.spec}')
          echo "$job_spec" | grep '"completions":2'
          echo "$job_spec" | grep '"parallelism":2'
          # -x requires the whole line to be "3", so 13 or 30 cannot pass.
          echo "$job_spec" | jq '.template.spec.containers | length' | grep -x 3
      # Counts how many output lines contain one of the seven section headers
      # and requires that count to be exactly 7.
      - name: Get job info for the last job created on the cluster
        run: python3 xpk.py job info ${JOB_NAME} | grep -e "Entrypoint environment variables template:" -e "Job name:" -e "Labels:" -e "Mounts:" -e "Pods:" -e "Profile:" -e "Script name:" | wc -l | grep -x 7
      - name: Cancel the batch job on the cluster
        run: |
          python3 xpk.py job cancel ${JOB_NAME} --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep "job.batch/${JOB_NAME} deleted"

      # --- Interactive shell tests ---
      # Writes a small expect script that spawns `xpk shell`, waits for the
      # "/ # " prompt and sends "exit".
      - name: Create shell and exit it immediately
        run: |
          cat <<'EOF' > create-shell.exp
          #!/usr/bin/expect
          spawn python3 ./xpk.py shell
          expect "/ # "
          send "exit\n"
          EOF
          chmod +x ./create-shell.exp
          expect ./create-shell.exp
      - name: Check if shell exists and is running
        run: kubectl get pods | grep xpk-def-app-profile-interactive- | grep Running
      - name: Stop the shell
        run: python3 xpk.py shell stop

      # Final cleanup; always() makes it run even when an earlier step failed.
      - name: Delete the cluster created
        if: always()
        run: python xpk.py cluster delete --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --force
# pw-cluster-and-workload:
# runs-on: [ubuntu-22.04]
# concurrency: # We support one build test to run at a time currently.
# group: nightly-pw-test-cluster-group
# cancel-in-progress: false
# steps:
# - uses: actions/checkout@v4
# - uses: actions/setup-python@v5
# with:
# python-version: '3.10'
# - uses: 'google-github-actions/auth@v2'
# with:
# credentials_json: '${{ secrets.GCP_SA_KEY }}'
# - uses: google-github-actions/setup-gcloud@v2
# with:
# version: '>= 363.0.0'
# install_components: 'beta,gke-gcloud-auth-plugin'
# - name: Set Google Cloud CLI properties to a unused zone to verify --zone arg is passed properly in commands.
# run: |
# gcloud config set compute/zone us-east4-a
# gcloud config get compute/zone
# - name: Install xpk dependencies
# run: |
# make install
# echo $PWD/bin >> "$GITHUB_PATH"
# - name: Check xpk installation
# run: xpk --help
# - name: Create a Pathways-enabled XPK Cluster with 2 x v4-8 nodepools
# run: python xpk.py cluster create-pathways --cluster $PATHWAYS_TPU_CLUSTER_NAME --tpu-type=v4-8 --num-slices=2 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=16 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments="${CLUSTER_NETWORK_ARGUMENTS}"
# - name: Create test script to execute in workloads
# run: echo -e '#!/bin/bash \n echo "Hello world from a test script!"' > workload.sh
# - name: Run a Pathways workload on Ubuntu base image
# run: python xpk.py workload create-pathways --cluster $PATHWAYS_TPU_CLUSTER_NAME --workload $PATHWAYS_WORKLOAD_NAME --docker-image='marketplace.gcr.io/google/ubuntu2004' --tpu-type=v4-8 --num-slices=2 --zone=us-central2-b --command "echo \"Hello world from a test script! \""
# - name: Wait for Pathways workload completion and confirm it succeeded
# run: python3 xpk.py workload list --cluster $PATHWAYS_TPU_CLUSTER_NAME --zone=us-central2-b --wait-for-job-completion $PATHWAYS_WORKLOAD_NAME --timeout 300
# - name: Delete the Pathways workload on the cluster
# run: python3 xpk.py workload delete --workload $PATHWAYS_WORKLOAD_NAME --cluster $PATHWAYS_TPU_CLUSTER_NAME --zone=us-central2-b
# - name: Delete the Pathways cluster created
# if: always()
# run: python xpk.py cluster delete --cluster $PATHWAYS_TPU_CLUSTER_NAME --zone=us-central2-b --force
#
# rc-cluster:
# runs-on: [ubuntu-22.04]
# concurrency: # We support one build test to run at a time currently.
# group: nightly-rc-test-cluster-group
# cancel-in-progress: false
# steps:
# - uses: actions/checkout@v4
# - uses: actions/setup-python@v5
# with:
# python-version: '3.10'
# - uses: 'google-github-actions/auth@v2'
# with:
# credentials_json: '${{ secrets.GCP_SA_KEY }}'
# - uses: google-github-actions/setup-gcloud@v2
# with:
# version: '>= 363.0.0'
# install_components: 'beta,gke-gcloud-auth-plugin'
# - name: Set Google Cloud CLI properties to a unused zone to verify --zone arg is passed properly in commands.
# run: |
# gcloud config set compute/zone us-east4-a
# gcloud config get compute/zone
# - name: Install xpk dependencies
# run: |
# make install
# echo $PWD/bin >> "$GITHUB_PATH"
# - name: Check xpk installation
# run: xpk --help
# - name: Create a RayCluster-enabled XPK Cluster with 2 x v4-8 nodepools
# run: python xpk.py cluster create-ray --cluster $RAYCLUSTER_TPU_CLUSTER_NAME --tpu-type=v4-8 --num-slices=2 --zone=us-central2-b --ray-version=2.39.0 --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=16 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments='${{ secrets.CLUSTER_ARGUMENTS}}'
# - name: Delete the RayCluster-enabled XPK cluster
# if: always()
# run: python xpk.py cluster delete --cluster $RAYCLUSTER_TPU_CLUSTER_NAME --zone=us-central2-b