Skip to content

Commit

Permalink
Update K8s docker image build and the source artifact registry (skypilot-org#4224)
Browse files Browse the repository at this point in the history

* Attempt at improving performance of k8s cluster launch

* remove conda env creation

* add multiple regions

* K8s sky launch pulls the new docker images

* Move k8s script

* use us region only

* typo
  • Loading branch information
yika-luo authored Nov 1, 2024
1 parent bf17e87 commit bc51eae
Show file tree
Hide file tree
Showing 6 changed files with 60 additions and 48 deletions.
24 changes: 9 additions & 15 deletions Dockerfile_k8s
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM continuumio/miniconda3:23.3.1-0
FROM --platform=linux/amd64 continuumio/miniconda3:23.3.1-0

# TODO(romilb): Investigate if this image can be consolidated with the skypilot
# client image (`Dockerfile`)
Expand Down Expand Up @@ -33,21 +33,15 @@ ENV HOME /home/sky
# Set current working directory
WORKDIR /home/sky

# Install SkyPilot pip dependencies preemptively to speed up provisioning time
RUN conda init && \
pip install wheel Click colorama cryptography jinja2 jsonschema networkx \
oauth2client pandas pendulum PrettyTable rich tabulate filelock packaging \
'protobuf<4.0.0' pulp pycryptodome==3.12.0 docker kubernetes==28.1.0 \
grpcio==1.51.3 python-dotenv==1.0.1 ray[default]==2.9.3 && \
# Install skypilot dependencies
RUN conda init && export PIP_DISABLE_PIP_VERSION_CHECK=1 && \
python3 -m venv ~/skypilot-runtime && \
PYTHON_EXEC=$(echo ~/skypilot-runtime)/bin/python && \
$PYTHON_EXEC -m pip install 'skypilot-nightly[remote,kubernetes]' 'ray[default]==2.9.3' 'pycryptodome==3.12.0' && \
$PYTHON_EXEC -m pip uninstall skypilot-nightly -y && \
curl -LO "https://dl.k8s.io/release/v1.28.11/bin/linux/amd64/kubectl" && \
sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl

# Add /home/sky/.local/bin/ to PATH
RUN echo 'export PATH="$PATH:$HOME/.local/bin"' >> ~/.bashrc

# Copy SkyPilot code base. This is required for the ssh jump pod to find the
# lifecycle management scripts
COPY --chown=sky . /skypilot/sky/
sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl && \
echo 'export PATH="$PATH:$HOME/.local/bin"' >> ~/.bashrc

# Set PYTHONUNBUFFERED=1 to have Python print to stdout/stderr immediately
ENV PYTHONUNBUFFERED=1
19 changes: 7 additions & 12 deletions Dockerfile_k8s_gpu
Original file line number Diff line number Diff line change
Expand Up @@ -41,19 +41,14 @@ RUN curl https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x8
eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && conda config --set auto_activate_base true && conda activate base && \
grep "# >>> conda initialize >>>" ~/.bashrc || { conda init && source ~/.bashrc; } && \
rm Miniconda3-Linux-x86_64.sh && \
pip install wheel Click colorama cryptography jinja2 jsonschema networkx \
oauth2client pandas pendulum PrettyTable rich tabulate filelock packaging \
'protobuf<4.0.0' pulp pycryptodome==3.12.0 docker kubernetes==28.1.0 \
grpcio==1.51.3 python-dotenv==1.0.1 ray[default]==2.9.3 && \
export PIP_DISABLE_PIP_VERSION_CHECK=1 && \
python3 -m venv ~/skypilot-runtime && \
PYTHON_EXEC=$(echo ~/skypilot-runtime)/bin/python && \
$PYTHON_EXEC -m pip install 'skypilot-nightly[remote,kubernetes]' 'ray[default]==2.9.3' 'pycryptodome==3.12.0' && \
$PYTHON_EXEC -m pip uninstall skypilot-nightly -y && \
curl -LO "https://dl.k8s.io/release/v1.28.11/bin/linux/amd64/kubectl" && \
sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl

# Add /home/sky/.local/bin/ to PATH
RUN echo 'export PATH="$PATH:$HOME/.local/bin"' >> ~/.bashrc

# Copy SkyPilot code base. This is required for the ssh jump pod to find the
# lifecycle management scripts
COPY --chown=sky . /skypilot/sky/
sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl && \
echo 'export PATH="$PATH:$HOME/.local/bin"' >> ~/.bashrc

# Set PYTHONUNBUFFERED=1 to have Python print to stdout/stderr immediately
ENV PYTHONUNBUFFERED=1
4 changes: 2 additions & 2 deletions sky/clouds/kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,8 @@ class Kubernetes(clouds.Cloud):
'Kubernetes.',
}

IMAGE_CPU = 'skypilot:cpu-ubuntu-2004'
IMAGE_GPU = 'skypilot:gpu-ubuntu-2004'
IMAGE_CPU = 'skypilot:custom-cpu-ubuntu-2004'
IMAGE_GPU = 'skypilot:custom-gpu-ubuntu-2004'

PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
STATUS_VERSION = clouds.StatusVersion.SKYPILOT
Expand Down
12 changes: 10 additions & 2 deletions sky/clouds/service_catalog/images/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ packer build ${IMAGE}.pkr.hcl
2. Make the image public
```bash
# Make image public
export IMAGE_NAME=skypilot-gcp-cpu-ubuntu-20241029144600 # Update this
export IMAGE_NAME=skypilot-gcp-gpu-ubuntu-241030 # Update this
export IMAGE_ID=projects/sky-dev-465/global/images/${IMAGE_NAME}
gcloud compute images add-iam-policy-binding ${IMAGE_NAME} --member='allAuthenticatedUsers' --role='roles/compute.imageUser'
```
Expand All @@ -46,7 +46,7 @@ packer build ${IMAGE}.pkr.hcl
2. Copy images to all regions
```bash
export TYPE=gpu # Update this
export IMAGE_ID=ami-05e9f5efd844f1a4f # Update this
export IMAGE_ID=ami-0989556a89639b1bb # Update this
python aws_utils/image_gen.py --image-id ${IMAGE_ID} --processor ${TYPE}
```
3. Add fallback images if any region failed \
Expand All @@ -65,6 +65,14 @@ packer build --var vm_generation=1 --var client_secret=${SECRET} skypilot-azure-
packer build --var vm_generation=2 --var client_secret=${SECRET} --var use_grid_driver=true skypilot-azure-gpu-ubuntu.pkr.hcl
```

### Kubernetes
1. Build and push the CPU and GPU images (tagged `latest`) for the chosen region
```bash
export REGION=europe # Update this: us, europe, asia
./skypilot-k8s-image.sh -p -l -r ${REGION}
./skypilot-k8s-image.sh -p -l -g -r ${REGION}
```

## Test Images
1. Minimal GPU test: `sky launch --image ${IMAGE_ID} --gpus=L4:1 --cloud ${CLOUD}` then run `nvidia-smi` in the launched instance.
2. Update the image ID in `sky/clouds/gcp.py` and run the test:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,38 +1,38 @@
#!/bin/bash
# Builds the Dockerfile_k8s image as the SkyPilot image.
# Optionally, if -p is specified, pushes the image to the registry.
# Uses buildx to build the image for both amd64 and arm64.
# If -p flag is specified, pushes the image to the registry.
# If -g flag is specified, builds the GPU image in Dockerfile_k8s_gpu. GPU image is built only for amd64.
# If -l flag is specified, uses the latest tag instead of the date tag. Date tag is of the form YYYYMMDD.
# Usage: ./build_image.sh [-p] [-g]
# Usage: ./skypilot-k8s-image.sh [-p] [-g] [-l] [-r region]
# -p: Push the image to the registry
# -g: Build the GPU image
# -l: Use latest tag

TAG=us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s/skypilot

# -g: Builds the GPU image in Dockerfile_k8s_gpu. GPU image is built only for amd64
# -l: Use latest tag instead of the date tag. Date tag is of the form YYYYMMDD
# -r: Specify the region to be us, europe or asia
region=us
push=false
gpu=false
latest=false

# Parse command line arguments
while getopts ":pgl" opt; do
OPTSTRING=":pglr:"
while getopts ${OPTSTRING} opt; do
case ${opt} in
p )
p)
push=true
;;
g )
g)
gpu=true
;;
l )
l)
latest=true
;;
\? )
echo "Usage: ./build_image.sh [-p] [-g] [-l]"
r)
region=${OPTARG}
;;
?)
echo "Usage: ./build_image.sh [-p] [-g] [-l] [-r region]"
echo "-p: Push the image to the registry"
echo "-g: Build the GPU image"
echo "-l: Use latest tag instead of the date tag"
echo "-r: Specify the region to be us, europe or asia"
exit 1
;;
esac
Expand All @@ -42,6 +42,9 @@ echo "Options:"
echo "Push: $push"
echo "GPU: $gpu"
echo "Latest: $latest"
echo "Region: $region"

TAG=$region-docker.pkg.dev/sky-dev-465/skypilotk8s/skypilot

# Set the version tag. If the latest flag is used, use the latest tag
if [[ $latest == "true" ]]; then
Expand Down
14 changes: 13 additions & 1 deletion sky/clouds/service_catalog/kubernetes_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,15 @@
from typing import Dict, List, Optional, Set, Tuple

from sky import check as sky_check
from sky import sky_logging
from sky.adaptors import common as adaptors_common
from sky.clouds import Kubernetes
from sky.clouds.service_catalog import CloudFilter
from sky.clouds.service_catalog import common
from sky.provision.kubernetes import utils as kubernetes_utils

logger = sky_logging.init_logger(__name__)

if typing.TYPE_CHECKING:
import pandas as pd
else:
Expand All @@ -31,7 +34,16 @@

def get_image_id_from_tag(tag: str, region: Optional[str]) -> Optional[str]:
"""Returns the image id from the tag."""
return common.get_image_id_from_tag_impl(_image_df, tag, region)
global _image_df
image_id = common.get_image_id_from_tag_impl(_image_df, tag, region)
if image_id is None:
# Refresh the image catalog and try again, if the image tag is not
# found.
logger.debug('Refreshing the image catalog and trying again.')
_image_df = common.read_catalog('kubernetes/images.csv',
pull_frequency_hours=0)
image_id = common.get_image_id_from_tag_impl(_image_df, tag, region)
return image_id


def is_image_tag_valid(tag: str, region: Optional[str]) -> bool:
Expand Down

0 comments on commit bc51eae

Please sign in to comment.