Skip to content

Commit

Permalink
Introduce Storage API (#192)
Browse files Browse the repository at this point in the history
* Introduce Storage API

* Fix Gh CI/CD

* Improve getting storages to mount to workload

* Improve handling exception while deleting storage

* Fix adding necessary args and cluster credentials

* Fix storage class, differentiate storage types in create and delete commands, add --type validation, reduce number of CRD installations
  • Loading branch information
PBundyra authored Oct 21, 2024
1 parent e26ad1c commit 73af01e
Show file tree
Hide file tree
Showing 20 changed files with 916 additions and 84 deletions.
32 changes: 27 additions & 5 deletions .github/workflows/build_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ env:
TPU_CLUSTER_NAME: build-xpk-2-v4-8-nodepools
WORKLOAD_NAME: xpktest-build-${{ github.run_attempt }}
PATHWAYS_WORKLOAD_NAME: xpkpw-build-${{ github.run_attempt }}
STORAGE_NAME: test-storage

jobs:
cluster-create-and-delete:
Expand Down Expand Up @@ -54,19 +55,38 @@ jobs:
pip install .
xpk --help
- name: Create a Pathways-enabled XPK Cluster with 2x v4-8 nodepools. Larger num-nodes to avoid master resizing.
run: python xpk.py cluster create-pathways --cluster $TPU_CLUSTER_NAME --tpu-type=v4-8 --num-slices=2 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=16 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}'
run: |
python3 xpk.py cluster create-pathways --cluster $TPU_CLUSTER_NAME --tpu-type=v4-8 --num-slices=2 \
--zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=16 \
--reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --enable-workload-identity --enable-gcsfuse-csi-driver
- name: Authenticate Docker
run: gcloud auth configure-docker --quiet
- name: Create auto-mount Storage instance
run: |
python3 xpk.py storage create $STORAGE_NAME --cluster=$TPU_CLUSTER_NAME --zone=us-central2-b --type=gcsfuse \
--auto-mount=true \
--mount-point='/test-mount-point' --readonly=false --manifest='./tests/data/pv-pvc-templates.yaml'
- name: List and verify existing Storages
run: python3 xpk.py storage list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | tee output.txt | grep 'test-storage' || (echo 'No storage found' && cat output.txt && exit 1)
- name: Create test script to execute in workloads
run: echo -e '#!/bin/bash \n echo "Hello world from a test script!"' > test.sh
run: |
echo -e \
'#!/bin/bash \n
echo "Hello world from a test script!"' \
> test.sh
- name: Run a base-docker-image workload
run: python xpk.py workload create --cluster $TPU_CLUSTER_NAME --workload $WORKLOAD_NAME --command "bash test.sh" --tpu-type=v4-8 --num-slices=2 --zone=us-central2-b
run: |
python3 xpk.py workload create --cluster $TPU_CLUSTER_NAME --workload $WORKLOAD_NAME --command "bash test.sh" \
--tpu-type=v4-8 --num-slices=2 --zone=us-central2-b
- name: Run xpk inspector with the workload created above
run: python3 xpk.py inspector --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --workload $WORKLOAD_NAME
- name: Wait for workload completion and confirm it succeeded
run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --wait-for-job-completion $WORKLOAD_NAME --timeout 300
- name: Run a Pathways workload on Ubuntu base image
run: python xpk.py workload create-pathways --cluster $TPU_CLUSTER_NAME --workload $PATHWAYS_WORKLOAD_NAME --docker-image='marketplace.gcr.io/google/ubuntu2004' --tpu-type=v4-8 --num-slices=2 --zone=us-central2-b --command "echo \"Hello world from a test script! \""
run: |
python3 xpk.py workload create-pathways --cluster $TPU_CLUSTER_NAME --workload $PATHWAYS_WORKLOAD_NAME \
--docker-image='marketplace.gcr.io/google/ubuntu2004' --tpu-type=v4-8 --num-slices=2 --zone=us-central2-b \
--command "echo \"Hello world from a test script! \""
- name: Wait for Pathways workload completion and confirm it succeeded
run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --wait-for-job-completion $PATHWAYS_WORKLOAD_NAME --timeout 300
- name: List out the workloads on the cluster
Expand All @@ -75,9 +95,11 @@ jobs:
run: python3 xpk.py workload delete --workload $WORKLOAD_NAME --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
- name: Delete the Pathways workload on the cluster
run: python3 xpk.py workload delete --workload $PATHWAYS_WORKLOAD_NAME --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
- name: Delete existing Storage
run: python3 xpk.py storage delete $STORAGE_NAME --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
- name: Delete the cluster created
if: always()
run: python xpk.py cluster delete --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
run: python3 xpk.py cluster delete --cluster $TPU_CLUSTER_NAME --zone=us-central2-b



Expand Down
9 changes: 7 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,11 @@ keywords = []

# pip dependencies installed with `pip install -e .`
dependencies = [
"cloud-accelerator-diagnostics"
"cloud-accelerator-diagnostics",
"kubernetes",
"google-cloud",
"google-api-core",
"tabulate",
]

[project.urls]
Expand All @@ -57,8 +61,9 @@ dev = [
version = {attr = "xpk.core.core.__version__"}

[tool.setuptools]
packages = ["xpk", "xpk.parser", "xpk.core", "xpk.commands"]
packages = ["xpk", "xpk.parser", "xpk.core", "xpk.commands", "xpk.api"]
package-dir = {"" = "src"}
package-data = {"xpk.api" = ["storage_crd.yaml"]}

[tool.pyink]
# Formatting configuration to follow Google style-guide.
Expand Down
15 changes: 15 additions & 0 deletions src/xpk/api/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
"""
Copyright 2024 Google LLC
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
52 changes: 52 additions & 0 deletions src/xpk/api/storage_crd.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
name: storages.xpk.x-k8s.io
spec:
group: xpk.x-k8s.io
versions:
- name: v1
served: true
storage: true
schema:
openAPIV3Schema:
type: object
properties:
spec:
type: object
properties:
type:
type: string
cluster:
type: string
auto_mount:
type: boolean
mount_point:
type: string
readonly:
type: boolean
manifest:
type: string
pv:
type: string
pvc:
type: string
required:
- type
- cluster
- auto_mount
- mount_point
- readonly
- manifest
- pvc
- pv
x-kubernetes-validations:
- message: Value is immutable
rule: self == oldSelf
scope: Cluster
names:
plural: storages
singular: storage
kind: Storage
shortNames:
- stg
52 changes: 11 additions & 41 deletions src/xpk/commands/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,7 @@
limitations under the License.
"""

from ..core.commands import (
run_command_for_value,
run_command_with_updates,
run_command_with_updates_retry,
)
from ..core.commands import run_command_for_value, run_command_with_updates
from ..core.core import (
VERTEX_TENSORBOARD_FEATURE_FLAG,
add_zone_and_project,
Expand All @@ -27,6 +23,7 @@
create_vertex_tensorboard,
delete_cluster_subnets,
get_all_clusters_programmatic,
get_cluster_credentials,
get_gke_control_plane_version,
get_gke_node_pool_version,
get_gke_server_config,
Expand All @@ -35,9 +32,10 @@
run_gke_node_pool_create_command,
set_jobset_on_cluster,
set_up_cluster_network_for_gpu,
setup_k8s_env,
update_cluster_with_clouddns_if_necessary,
update_cluster_with_workload_identity_if_necessary,
update_cluster_with_gcsfuse_driver_if_necessary,
update_cluster_with_workload_identity_if_necessary,
zone_to_region,
)
from ..core.kueue import (
Expand All @@ -46,6 +44,7 @@
install_kueue_on_cluster,
)
from ..core.nap import enable_autoprovisioning_on_cluster
from ..core.storage import install_storage_crd
from ..core.system_characteristics import (
AcceleratorType,
AcceleratorTypeToAcceleratorCharacteristics,
Expand Down Expand Up @@ -113,9 +112,7 @@ def cluster_create(args) -> None:
if update_cluster_command_code != 0:
xpk_exit(update_cluster_command_code)

set_cluster_command_code = set_cluster_command(args)
if set_cluster_command_code != 0:
xpk_exit(set_cluster_command_code)
get_cluster_credentials(args)

# create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set
tensorboard_config = {}
Expand Down Expand Up @@ -164,6 +161,8 @@ def cluster_create(args) -> None:
if install_kueue_on_cluster_code != 0:
xpk_exit(install_kueue_on_cluster_code)

k8s_client = setup_k8s_env(args)
install_storage_crd(k8s_client)
# Provision node pools dynamically based on incoming workloads:
# Currently autoprovisioning is not supported with Pathways.
autoprovisioning_config = None
Expand Down Expand Up @@ -236,9 +235,7 @@ def cluster_cacheimage(args) -> None:
)
add_zone_and_project(args)

set_cluster_command_code = set_cluster_command(args)
if set_cluster_command_code != 0:
xpk_exit(set_cluster_command_code)
get_cluster_credentials(args)
system, return_code = get_system_characteristics(args)

if return_code > 0:
Expand Down Expand Up @@ -287,9 +284,7 @@ def cluster_describe(args) -> None:
xpk_print(f'Starting nodepool list for cluster: {args.cluster}', flush=True)
add_zone_and_project(args)

set_cluster_command_code = set_cluster_command(args)
if set_cluster_command_code != 0:
xpk_exit(set_cluster_command_code)
get_cluster_credentials(args)

command = (
f'gcloud container node-pools list --cluster {args.cluster} '
Expand Down Expand Up @@ -470,8 +465,8 @@ def run_gke_cluster_create_command(
' --enable-autoscaling'
' --total-min-nodes 1 --total-max-nodes 1000'
f' --num-nodes {args.default_pool_cpu_num_nodes}'
f' {args.custom_cluster_arguments}'
' --release-channel rapid'
f' {args.custom_cluster_arguments}'
)

if system.accelerator_type == AcceleratorType['GPU']:
Expand Down Expand Up @@ -502,28 +497,3 @@ def run_gke_cluster_create_command(
xpk_print(f'GKE Cluster Create request returned ERROR {return_code}')
return 1
return 0


def set_cluster_command(args) -> int:
"""Run cluster configuration command to set the kubectl config.
Args:
args: user provided arguments for running the command.
Returns:
0 if successful and 1 otherwise.
"""
command = (
'gcloud container clusters get-credentials'
f' {args.cluster} --region={zone_to_region(args.zone)}'
f' --project={args.project} &&'
' kubectl config view && kubectl config set-context --current'
' --namespace=default'
)
task = f'get-credentials to cluster {args.cluster}'
return_code = run_command_with_updates_retry(
command, task, args, verbose=False
)
if return_code != 0:
xpk_print(f'{task} returned ERROR {return_code}')
return return_code
6 changes: 2 additions & 4 deletions src/xpk/commands/inspector.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@
CLUSTER_METADATA_CONFIGMAP,
CLUSTER_RESOURCES_CONFIGMAP,
add_zone_and_project,
get_cluster_credentials,
zone_to_region,
)
from ..core.kueue import CLUSTER_QUEUE_NAME, LOCAL_QUEUE_NAME
from ..utils import append_tmp_file, write_tmp_file, xpk_exit, xpk_print
from .cluster import set_cluster_command
from .workload import get_workload_list


Expand Down Expand Up @@ -124,9 +124,7 @@ def inspector(args) -> None:
xpk_print(args)

add_zone_and_project(args)
set_cluster_command_code = set_cluster_command(args)
if set_cluster_command_code != 0:
xpk_exit(set_cluster_command_code)
get_cluster_credentials(args)

inspector_file = write_tmp_file(
'==================\nXPK inspector OUTPUT:\n==================\n'
Expand Down
Loading

0 comments on commit 73af01e

Please sign in to comment.