Skip to content

Commit

Permalink
More CI improvements (#8313)
Browse files Browse the repository at this point in the history
* Reduce clutter in log of Python test

* Set up BuildKite test analytics

* Add separate step for building containers

* Enable incremental update of CI stack; custom agent IAM policy
  • Loading branch information
hcho3 authored Oct 6, 2022
1 parent bc7a6ec commit 50ff8a2
Show file tree
Hide file tree
Showing 11 changed files with 229 additions and 31 deletions.
36 changes: 36 additions & 0 deletions tests/buildkite/build-containers.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/bin/bash
#
# Build a single CI container image by running a no-op command through
# tests/ci_build/ci_build.sh.
#
# Usage: tests/buildkite/build-containers.sh <container>
#   <container>  one of: gpu, rmm, jvm_gpu_build
#
# CUDA_VERSION and RAPIDS_VERSION are expected to be defined once
# tests/buildkite/conftest.sh has been sourced.
#
# Exit status: 1 if no container was given, 2 if the container ID is unknown,
# otherwise the status of ci_build.sh.

set -euo pipefail
set -x

# This file is executed, not sourced, so failures must use `exit`; `return`
# is only valid inside a function or a sourced script.
if [[ "$#" -lt 1 ]]; then
  echo "Usage: $0 [container to build]" >&2
  exit 1
fi
container=$1

source tests/buildkite/conftest.sh

echo "--- Build container ${container}"

# Collect the extra docker build arguments in an array so each argument is
# passed as exactly one word.
build_args=()

case "${container}" in
  gpu|rmm)
    build_args+=(--build-arg "CUDA_VERSION_ARG=${CUDA_VERSION}")
    build_args+=(--build-arg "RAPIDS_VERSION_ARG=${RAPIDS_VERSION}")
    ;;

  jvm_gpu_build)
    build_args+=(--build-arg "CUDA_VERSION_ARG=${CUDA_VERSION}")
    ;;

  *)
    echo "Unrecognized container ID: ${container}" >&2
    exit 2
    ;;
esac

# Run a no-op command. This will simply build the container and push it to the private registry
tests/ci_build/ci_build.sh "${container}" docker "${build_args[@]}" bash
19 changes: 19 additions & 0 deletions tests/buildkite/conftest.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,25 @@
#!/bin/bash

set -euo pipefail

#######################################
# Print the SecretString of an AWS Secrets Manager secret (region us-west-2).
# Arguments: $1 - name (ID) of the secret
# Outputs:   the secret value on stdout; the usage message on stderr
# Returns:   1 when not called with exactly one argument, otherwise the
#            exit status of the aws CLI
#######################################
function get_aws_secret {
  if [[ $# -ne 1 ]]; then
    # The result of this function is captured with $(...), so the usage
    # message must not be written to stdout where it would be mistaken
    # for the secret value.
    echo "Usage: get_aws_secret [Name of secret]" >&2
    return 1
  fi
  aws secretsmanager get-secret-value --secret-id "$1" --output text --region us-west-2 --query SecretString
}

#######################################
# Append one "--env NAME" flag per Buildkite-specific variable to
# CI_DOCKER_EXTRA_PARAMS_INIT and export the result, so that the variables
# are passed to Docker containers. Intended for use with
# tests/ci_build/ci_build.sh.
# Globals: CI_DOCKER_EXTRA_PARAMS_INIT (read if set, written, exported)
#######################################
function set_buildkite_env_vars_in_container {
  local var_name
  local env_flags=""
  for var_name in \
    BUILDKITE_ANALYTICS_TOKEN BUILDKITE_BUILD_ID BUILDKITE_BUILD_NUMBER \
    BUILDKITE_JOB_ID BUILDKITE_BRANCH BUILDKITE_COMMIT \
    BUILDKITE_MESSAGE BUILDKITE_BUILD_URL; do
    # Each flag carries its own leading space, which also separates the
    # first flag from any previously configured parameters.
    env_flags+=" --env ${var_name}"
  done
  export CI_DOCKER_EXTRA_PARAMS_INIT="${CI_DOCKER_EXTRA_PARAMS_INIT:-}${env_flags}"
}

set -x

CUDA_VERSION=11.0.3
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
---
# CloudFormation template that declares a single IAM managed policy for the
# Buildkite agents.
# NOTE(review): presumably this is the agent-iam-policy-template.yml read by
# create_stack.py, which looks up the BuildkiteAgentManagedPolicy resource --
# confirm the file name.
AWSTemplateFormatVersion: "2010-09-09"
Description: "Buildkite agent's IAM policy"

# The policy document below allows, on every resource ("*"):
#   - all S3 and S3 Object Lambda actions
#   - lambda:InvokeFunction
#   - secretsmanager:GetSecretValue
Resources:
BuildkiteAgentManagedPolicy:
Type: AWS::IAM::ManagedPolicy
Properties:
PolicyDocument:
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"s3:*",
"s3-object-lambda:*"
],
"Resource": "*"
},
{
"Effect": "Allow",
"Action": "lambda:InvokeFunction",
"Resource": "*"
},
{
"Effect": "Allow",
"Action": "secretsmanager:GetSecretValue",
"Resource": "*"
}
]
}
123 changes: 103 additions & 20 deletions tests/buildkite/infrastructure/aws-stack-creator/create_stack.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
import argparse
import copy
import os
import re

import boto3

import botocore
from metadata import AMI_ID, COMMON_STACK_PARAMS, STACK_PARAMS

current_dir = os.path.dirname(__file__)

TEMPLATE_URL = "https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml"


Expand Down Expand Up @@ -33,7 +37,7 @@ def get_default_vpc(*, aws_region):
return ec2.Vpc(default_vpc_id)


def format_params(args, *, stack_id):
def format_params(args, *, stack_id, agent_iam_policy):
default_vpc = get_default_vpc(aws_region=args.aws_region)
azs = get_availability_zones(aws_region=args.aws_region)
# For each of the first two availability zones (AZs), choose the default subnet
Expand All @@ -55,6 +59,7 @@ def format_params(args, *, stack_id):
params["BuildkiteAgentToken"] = args.agent_token
params["VpcId"] = default_vpc.id
params["Subnets"] = ",".join(subnets)
params["ManagedPolicyARN"] = agent_iam_policy
params.update(COMMON_STACK_PARAMS)
return [{"ParameterKey": k, "ParameterValue": v} for k, v in params.items()]

Expand All @@ -63,34 +68,112 @@ def get_full_stack_id(stack_id):
return f"buildkite-{stack_id}-autoscaling-group"


def stack_exists(args, *, stack_name):
    """Return True if the "stack_exists" waiter succeeds for ``stack_name``.

    The waiter is limited to a single attempt, so this is a one-shot probe
    in the region given by ``args.aws_region``. A ``WaiterError`` is
    reported as ``False``.
    """
    cfn = boto3.client("cloudformation", region_name=args.aws_region)
    probe = cfn.get_waiter("stack_exists")
    try:
        probe.wait(StackName=stack_name, WaiterConfig={"MaxAttempts": 1})
    except botocore.exceptions.WaiterError:
        return False
    else:
        return True


def create_or_update_stack(
    args, *, stack_name, template_url=None, template_body=None, params=None
):
    """Create the stack ``stack_name``, or update it if it already exists.

    The request is only submitted; this function does not wait for it to
    complete. The returned dict ("promise") has the keys ``"StackName"`` and
    ``"Action"``, where the action is ``"create"``, ``"update"`` or
    ``"noop"`` (the update was rejected because there is nothing to change).

    Any other ``ClientError`` raised by the update call is re-raised.
    """
    request = {
        "StackName": stack_name,
        "Capabilities": [
            "CAPABILITY_IAM",
            "CAPABILITY_NAMED_IAM",
            "CAPABILITY_AUTO_EXPAND",
        ],
    }
    # Only forward the optional fields that were actually supplied.
    optional_fields = (
        ("TemplateURL", template_url),
        ("TemplateBody", template_body),
        ("Parameters", params),
    )
    for field, value in optional_fields:
        if value:
            request[field] = value

    cfn = boto3.client("cloudformation", region_name=args.aws_region)

    if not stack_exists(args, stack_name=stack_name):
        # These two options are only passed when creating a stack.
        request["OnFailure"] = "ROLLBACK"
        request["EnableTerminationProtection"] = False
        cfn.create_stack(**request)
        return {"StackName": stack_name, "Action": "create"}

    print(f"Stack {stack_name} already exists. Updating...")
    try:
        cfn.update_stack(**request)
    except botocore.exceptions.ClientError as err:
        error = err.response["Error"]
        nothing_to_update = error["Code"] == "ValidationError" and re.search(
            "No updates are to be performed", error["Message"]
        )
        if not nothing_to_update:
            raise
        print(f"No update was made to {stack_name}")
        return {"StackName": stack_name, "Action": "noop"}
    return {"StackName": stack_name, "Action": "update"}


def wait(promise, *, aws_region=None):
    """Block until the stack operation described by ``promise`` has finished.

    Parameters
    ----------
    promise : dict
        Must contain ``"StackName"`` and ``"Action"``; the action is one of
        ``"create"``, ``"update"`` or ``"noop"``.
    aws_region : str, optional
        Region of the CloudFormation client. When omitted, the region is read
        from a module-level ``args`` object, which is what this function
        relied on before the parameter existed.
        NOTE(review): ``args`` is not defined in this function; presumably the
        ``__main__`` block assigns it at module scope -- confirm.

    Raises
    ------
    ValueError
        If the action is not one of the values listed above.
    """
    stack_name = promise["StackName"]
    action = promise["Action"]
    print(f"Waiting for {stack_name}...")

    if action == "noop":
        # Nothing was submitted, so there is nothing to wait for and no
        # client is needed.
        return

    waiters = {
        "create": ("stack_create_complete", "creating"),
        "update": ("stack_update_complete", "updating"),
    }
    if action not in waiters:
        # Reject a malformed promise before touching AWS.
        raise ValueError(f"Invalid promise {promise}")
    waiter_name, verb = waiters[action]

    if aws_region is None:
        # Backward-compatible fallback for callers that pass only the promise.
        aws_region = args.aws_region
    client = boto3.client("cloudformation", region_name=aws_region)
    client.get_waiter(waiter_name).wait(StackName=stack_name)
    print(f"Finished {verb} stack {stack_name}")


def create_agent_iam_policy(args):
    """Create or update the agent IAM policy stack and return the policy's ID.

    Reads ``agent-iam-policy-template.yml`` from the directory of this script,
    submits it as the stack ``buildkite-agent-iam-policy``, waits for the
    operation to finish, and returns the physical resource ID of the
    ``BuildkiteAgentManagedPolicy`` resource in that stack.
    """
    stack_name = "buildkite-agent-iam-policy"
    print(f"Creating stack {stack_name} for agent IAM policy...")

    template_path = os.path.join(current_dir, "agent-iam-policy-template.yml")
    with open(template_path, encoding="utf-8") as template_file:
        template_body = template_file.read()

    pending = create_or_update_stack(
        args, stack_name=stack_name, template_body=template_body
    )
    wait(pending)

    cloudformation = boto3.resource("cloudformation", region_name=args.aws_region)
    policy_resource = cloudformation.StackResource(
        stack_name, "BuildkiteAgentManagedPolicy"
    )
    return policy_resource.physical_resource_id


def main(args):
    """Create or update the agent IAM policy stack and every elastic CI stack.

    The IAM policy stack is handled first because its resource ID is passed
    to each CI stack as a parameter. The CI stacks (one per key of
    ``AMI_ID``) are then submitted without waiting, and only afterwards is
    each submitted operation waited on.
    """
    agent_iam_policy = create_agent_iam_policy(args)

    promises = []

    for stack_id in AMI_ID:
        stack_id_full = get_full_stack_id(stack_id)
        print(f"Creating elastic CI stack {stack_id_full}...")

        params = format_params(
            args, stack_id=stack_id, agent_iam_policy=agent_iam_policy
        )

        # create_or_update_stack() returns once the request is submitted, so
        # all stacks make progress concurrently.
        promise = create_or_update_stack(
            args, stack_name=stack_id_full, template_url=TEMPLATE_URL, params=params
        )
        promises.append(promise)
        print(f"CI stack {stack_id_full} is in progress in the background")

    for promise in promises:
        wait(promise)


if __name__ == "__main__":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
"us-west-2": "ami-0a1a2ea551a07ad5f",
},
# Managed by BuildKite
# from https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml
"linux-amd64-cpu": {
"us-west-2": "ami-075d4c25d5f0c17c1",
},
Expand Down Expand Up @@ -105,7 +106,6 @@
"EnableCostAllocationTags": "true",
"CostAllocationTagName": "CreatedBy",
"ECRAccessPolicy": "full",
"ManagedPolicyARN": "arn:aws:iam::aws:policy/AmazonS3FullAccess,arn:aws:iam::aws:policy/service-role/AWSLambdaRole",
"EnableSecretsPlugin": "false",
"EnableECRPlugin": "false",
"EnableDockerLoginPlugin": "false",
Expand Down
11 changes: 9 additions & 2 deletions tests/buildkite/pipeline-mgpu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,15 @@ steps:
- wait
- block: ":rocket: Run this test job"
if: build.pull_request.repository.fork == true
#### -------- CONTAINER BUILD --------
- label: ":docker: Build containers"
commands:
- "tests/buildkite/build-containers.sh gpu"
- "tests/buildkite/build-containers.sh jvm_gpu_build"
key: build-containers
agents:
queue: linux-amd64-cpu
- wait
#### -------- BUILD --------
- label: ":console: Build CUDA"
command: "tests/buildkite/build-cuda.sh"
Expand All @@ -24,9 +33,7 @@ steps:
key: build-jvm-packages-gpu
agents:
queue: linux-amd64-mgpu

- wait

#### -------- TEST --------
- label: ":console: Test Python package, 4 GPUs"
command: "tests/buildkite/test-python-gpu.sh mgpu"
Expand Down
14 changes: 10 additions & 4 deletions tests/buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,15 @@ steps:
- wait
- block: ":rocket: Run this test job"
if: build.pull_request.repository.fork == true
#### -------- CONTAINER BUILD --------
- label: ":docker: Build containers"
commands:
- "tests/buildkite/build-containers.sh gpu"
- "tests/buildkite/build-containers.sh rmm"
key: build-containers
agents:
queue: linux-amd64-cpu
- wait
#### -------- BUILD --------
- label: ":console: Run clang-tidy"
command: "tests/buildkite/run-clang-tidy.sh"
Expand Down Expand Up @@ -52,9 +61,7 @@ steps:
key: build-jvm-doc
agents:
queue: linux-amd64-cpu

- wait

#### -------- TEST --------
- label: ":console: Test Python package, CPU"
command: "tests/buildkite/test-python-cpu.sh"
Expand All @@ -81,9 +88,8 @@ steps:
key: test-integration-jvm-packages
agents:
queue: linux-amd64-cpu

- wait

#### -------- DEPLOY JVM --------
- label: ":console: Deploy JVM packages"
command: "tests/buildkite/deploy-jvm-packages.sh"
key: deploy-jvm-packages
Expand Down
4 changes: 4 additions & 0 deletions tests/buildkite/test-python-gpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,15 @@ command_wrapper="tests/ci_build/ci_build.sh gpu nvidia-docker --build-arg "`
# Run specified test suite
case "$suite" in
gpu)
export BUILDKITE_ANALYTICS_TOKEN=$(get_aws_secret buildkite/test_analytics/gpu)
set_buildkite_env_vars_in_container
echo "--- Test XGBoost Python package, single GPU"
$command_wrapper tests/ci_build/test_python.sh $suite
;;

mgpu)
export BUILDKITE_ANALYTICS_TOKEN=$(get_aws_secret buildkite/test_analytics/mgpu)
set_buildkite_env_vars_in_container
echo "--- Test XGBoost Python package, 4 GPUs"
$command_wrapper tests/ci_build/test_python.sh $suite
;;
Expand Down
4 changes: 3 additions & 1 deletion tests/ci_build/Dockerfile.gpu
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@ RUN \
python=3.9 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG \
dask dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \
numpy pytest scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \
pyspark cloudpickle cuda-python=11.7.0 && mamba clean --all
pyspark cloudpickle cuda-python=11.7.0 && \
mamba clean --all && \
conda run --no-capture-output -n gpu_test pip install buildkite-test-collector

ENV GOSU_VERSION 1.10
ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/
Expand Down
6 changes: 4 additions & 2 deletions tests/ci_build/Dockerfile.rmm
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,10 @@ ENV PATH=/opt/python/bin:$PATH

# Create new Conda environment with RMM
RUN \
conda create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
python=3.9 rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG cmake
conda install -c conda-forge mamba && \
mamba create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
python=3.9 rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG cmake && \
mamba clean --all

ENV GOSU_VERSION 1.10

Expand Down
Loading

0 comments on commit 50ff8a2

Please sign in to comment.