From e436f18ae2e321a9f0029e3e59f3cb099aeee1a7 Mon Sep 17 00:00:00 2001 From: Robert Bailey Date: Fri, 26 Apr 2024 02:36:32 -0700 Subject: [PATCH] Set the default GKE cluster type for ray to GKE Autopilot. Also add instructions to use a standard cluster if preferred. --- applications/ray/README.md | 58 +++++++++++++++++++++++++------ applications/ray/variables.tf | 4 +-- applications/ray/workloads.tfvars | 9 +++-- 3 files changed, 56 insertions(+), 15 deletions(-) diff --git a/applications/ray/README.md b/applications/ray/README.md index a4d470b38..934ffaa14 100644 --- a/applications/ray/README.md +++ b/applications/ray/README.md @@ -3,22 +3,58 @@ This repository contains a Terraform template for running [Ray](https://www.ray.io/) on Google Kubernetes Engine. See the [Ray on GKE](/ray-on-gke/) directory to see additional guides and references. +## Prerequisites + +1. A GCP project with the following APIs enabled + - container.googleapis.com + - iap.googleapis.com (required when using authentication with Identity Aware Proxy) + +2. A functional GKE cluster. + - To create a new standard or autopilot cluster, follow the instructions in [`infrastructure/README.md`](https://github.com/GoogleCloudPlatform/ai-on-gke/blob/main/infrastructure/README.md) + - Alternatively, you can set the `create_cluster` variable to true in `workloads.tfvars` to provision a new GKE cluster. This will default to creating a GKE Autopilot cluster; if you want to provision a standard cluster you must also set `autopilot_cluster` to false. + +3. This module is configured to optionally use Identity Aware Proxy (IAP) to protect access to the Ray dashboard. It expects the OAuth brand and consent screen to already be configured in your organization. You can check the details here: [OAuth consent screen](https://console.cloud.google.com/apis/credentials/consent) + +4. 
Preinstall the following on your computer: + * Terraform + * gcloud CLI + ## Installation -Preinstall the following on your computer: -* Terraform -* Gcloud +### Configure Inputs -> **_NOTE:_** Terraform keeps state metadata in a local file called `terraform.tfstate`. Deleting the file may cause some resources to not be cleaned up correctly even if you delete the cluster. We suggest using `terraform destory` before reapplying/reinstalling. +1. If needed, clone the repo +``` +git clone https://github.com/GoogleCloudPlatform/ai-on-gke +cd ai-on-gke/applications/ray +``` -1. If needed, git clone https://github.com/GoogleCloudPlatform/ai-on-gke +2. Edit `workloads.tfvars` with your GCP settings. + +**Important Note:** +If using this with the Jupyter module (`applications/jupyter/`), it is recommended to use the same k8s namespace +for both, i.e. set `kubernetes_namespace` to the same namespace as in `applications/jupyter/workloads.tfvars`. + +| Variable | Description | Required | +|-----------------------------|----------------------------------------------------------------------------------------------------------------|:--------:| +| project_id | GCP Project Id | Yes | +| cluster_name | GKE Cluster Name | Yes | +| cluster_location | GCP Region | Yes | +| kubernetes_namespace | The namespace that Ray and rest of the other resources will be installed in. | Yes | +| gcs_bucket | GCS bucket to be used for Ray storage | Yes | +| create_service_account | Create service accounts used for Workload Identity mapping | Yes | + + +### Install + +> **_NOTE:_** Terraform keeps state metadata in a local file called `terraform.tfstate`. Deleting the file may cause some resources to not be cleaned up correctly even if you delete the cluster. We suggest using `terraform destroy` before reapplying/reinstalling. -2. `cd applications/ray` +3. Ensure your gcloud application default credentials are in place. +``` +gcloud auth application-default login +``` -3. 
Find the name and location of the GKE cluster you want to use. - Run `gcloud container clusters list --project=` to see all the available clusters. - _Note: If you created the GKE cluster via the infrastructure repo, you can get the cluster info from `platform.tfvars`_ +4. Run `terraform init` -4. Edit `workloads.tfvars` with your environment specific variables and configurations. +5. Run `terraform apply --var-file=./workloads.tfvars`. -5. Run `terraform init && terraform apply --var-file workloads.tfvars` \ No newline at end of file diff --git a/applications/ray/variables.tf b/applications/ray/variables.tf index ff720c010..417a5eb75 100644 --- a/applications/ray/variables.tf +++ b/applications/ray/variables.tf @@ -39,7 +39,7 @@ variable "ray_version" { variable "kubernetes_namespace" { type = string description = "Kubernetes namespace where resources are deployed" - default = "myray" + default = "ml" } variable "enable_grafana_on_ray_dashboard" { @@ -105,7 +105,7 @@ variable "private_cluster" { variable "autopilot_cluster" { type = bool - default = false + default = true } variable "cpu_pools" { diff --git a/applications/ray/workloads.tfvars b/applications/ray/workloads.tfvars index 88065b06f..c8f82f0d2 100644 --- a/applications/ray/workloads.tfvars +++ b/applications/ray/workloads.tfvars @@ -17,11 +17,16 @@ ## Need to pull this variables from tf output from previous platform stage project_id = "" -## this is required for terraform to connect to GKE master and deploy workloads -create_cluster = false # this flag will create a new standard public gke cluster in default network +## This is required for terraform to connect to GKE cluster and deploy workloads. cluster_name = "" cluster_location = "us-central1" +## If terraform should create a new GKE cluster, fill in this section as well. +## By default, a public autopilot GKE cluster will be created in the default network. +## Set the autopilot_cluster variable to false to create a standard cluster instead. 
+create_cluster = false +autopilot_cluster = true + ####################################################### #### APPLICATIONS #######################################################