From 50ff8a26238b08d44862971f31b7beae1d57426b Mon Sep 17 00:00:00 2001
From: Philip Hyunsu Cho
Date: Thu, 6 Oct 2022 06:33:46 -0800
Subject: [PATCH] More CI improvements (#8313)

* Reduce clutter in log of Python test
* Set up BuildKite test analytics
* Add separate step for building containers
* Enable incremental update of CI stack; custom agent IAM policy
---
 tests/buildkite/build-containers.sh           |  37 +++++
 tests/buildkite/conftest.sh                   |  19 +++
 .../agent-iam-policy-template.yml             |  32 +++++
 .../aws-stack-creator/create_stack.py         | 137 +++++++++++++++---
 .../aws-stack-creator/metadata.py             |   2 +-
 tests/buildkite/pipeline-mgpu.yml             |  11 +-
 tests/buildkite/pipeline.yml                  |  14 +-
 tests/buildkite/test-python-gpu.sh            |   4 +
 tests/ci_build/Dockerfile.gpu                 |   4 +-
 tests/ci_build/Dockerfile.rmm                 |   6 +-
 tests/ci_build/test_python.sh                 |   9 +-
 11 files changed, 244 insertions(+), 31 deletions(-)
 create mode 100755 tests/buildkite/build-containers.sh
 create mode 100644 tests/buildkite/infrastructure/aws-stack-creator/agent-iam-policy-template.yml

diff --git a/tests/buildkite/build-containers.sh b/tests/buildkite/build-containers.sh
new file mode 100755
index 000000000000..b12da2a634ed
--- /dev/null
+++ b/tests/buildkite/build-containers.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+set -euo pipefail
+set -x
+
+if [ "$#" -lt 1 ]
+then
+  echo "Usage: $0 [container to build]" >&2
+  exit 1
+fi
+container=$1
+
+source tests/buildkite/conftest.sh
+
+echo "--- Build container ${container}"
+
+BUILD_ARGS=""
+
+case "${container}" in
+  gpu|rmm)
+    BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION"
+    BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION"
+    ;;
+
+  jvm_gpu_build)
+    BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION"
+    ;;
+
+  *)
+    echo "Unrecognized container ID: ${container}" >&2
+    exit 2
+    ;;
+esac
+
+# Run a no-op command. This will simply build the container and push it to the private registry
+# BUILD_ARGS is left unquoted on purpose so that it splits into separate arguments
+tests/ci_build/ci_build.sh "${container}" docker ${BUILD_ARGS} bash
diff --git a/tests/buildkite/conftest.sh b/tests/buildkite/conftest.sh
index dea3f75ac473..30ef4aeab3bc 100755
--- a/tests/buildkite/conftest.sh
+++ b/tests/buildkite/conftest.sh
@@ -1,6 +1,25 @@
 #!/bin/bash
 
 set -euo pipefail
+
+function get_aws_secret {
+  if [[ $# -ne 1 ]]
+  then
+    echo "Usage: get_aws_secret [Name of secret]"
+    return 1
+  fi
+  aws secretsmanager get-secret-value --secret-id "$1" --output text --region us-west-2 --query SecretString
+}
+
+function set_buildkite_env_vars_in_container {
+  # Pass all Buildkite-specific env vars to Docker containers.
+  # This is to be used with tests/ci_build/ci_build.sh
+  export CI_DOCKER_EXTRA_PARAMS_INIT="${CI_DOCKER_EXTRA_PARAMS_INIT:-} "`
+    `"--env BUILDKITE_ANALYTICS_TOKEN --env BUILDKITE_BUILD_ID --env BUILDKITE_BUILD_NUMBER "`
+    `"--env BUILDKITE_JOB_ID --env BUILDKITE_BRANCH --env BUILDKITE_COMMIT "`
+    `"--env BUILDKITE_MESSAGE --env BUILDKITE_BUILD_URL"
+}
+
 set -x
 
 CUDA_VERSION=11.0.3
diff --git a/tests/buildkite/infrastructure/aws-stack-creator/agent-iam-policy-template.yml b/tests/buildkite/infrastructure/aws-stack-creator/agent-iam-policy-template.yml
new file mode 100644
index 000000000000..7f15b1fbcd4f
--- /dev/null
+++ b/tests/buildkite/infrastructure/aws-stack-creator/agent-iam-policy-template.yml
@@ -0,0 +1,32 @@
+---
+AWSTemplateFormatVersion: "2010-09-09"
+Description: "Buildkite agent's IAM policy"
+
+Resources:
+  BuildkiteAgentManagedPolicy:
+    Type: AWS::IAM::ManagedPolicy
+    Properties:
+      PolicyDocument:
+        {
+          "Version": "2012-10-17",
+          "Statement": [
+            {
+              "Effect": "Allow",
+              "Action": [
+                "s3:*",
+                "s3-object-lambda:*"
+              ],
+              "Resource": "*"
+            },
+            {
+              "Effect": "Allow",
+              "Action": "lambda:InvokeFunction",
+              "Resource": "*"
+            },
+            {
+              "Effect": "Allow",
+              "Action": "secretsmanager:GetSecretValue",
+              "Resource": "*"
+            }
+          ]
+        }
diff --git a/tests/buildkite/infrastructure/aws-stack-creator/create_stack.py b/tests/buildkite/infrastructure/aws-stack-creator/create_stack.py
index f4e92f634505..b9409de4cdcf 100644
--- a/tests/buildkite/infrastructure/aws-stack-creator/create_stack.py
+++ b/tests/buildkite/infrastructure/aws-stack-creator/create_stack.py
@@ -1,10 +1,14 @@
 import argparse
 import copy
+import os
+import re
 
 import boto3
-
+import botocore.exceptions
 from metadata import AMI_ID, COMMON_STACK_PARAMS, STACK_PARAMS
 
+current_dir = os.path.dirname(__file__)
+
 TEMPLATE_URL = "https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml"
 
 
@@ -33,7 +37,7 @@ def get_default_vpc(*, aws_region):
     return ec2.Vpc(default_vpc_id)
 
 
-def format_params(args, *, stack_id):
+def format_params(args, *, stack_id, agent_iam_policy):
     default_vpc = get_default_vpc(aws_region=args.aws_region)
     azs = get_availability_zones(aws_region=args.aws_region)
     # For each of the first two availability zones (AZs), choose the default subnet
@@ -55,6 +59,7 @@ def format_params(args, *, stack_id):
     params["BuildkiteAgentToken"] = args.agent_token
     params["VpcId"] = default_vpc.id
     params["Subnets"] = ",".join(subnets)
+    params["ManagedPolicyARN"] = agent_iam_policy
     params.update(COMMON_STACK_PARAMS)
     return [{"ParameterKey": k, "ParameterValue": v} for k, v in params.items()]
 
@@ -63,34 +68,126 @@ def get_full_stack_id(stack_id):
     return f"buildkite-{stack_id}-autoscaling-group"
 
 
+def stack_exists(args, *, stack_name):
+    """Return True if a CloudFormation stack named ``stack_name`` exists."""
+    client = boto3.client("cloudformation", region_name=args.aws_region)
+    waiter = client.get_waiter("stack_exists")
+    try:
+        # A single attempt makes the waiter a one-shot existence probe.
+        waiter.wait(StackName=stack_name, WaiterConfig={"MaxAttempts": 1})
+        return True
+    except botocore.exceptions.WaiterError:
+        return False
+
+
+def create_or_update_stack(
+    args, *, stack_name, template_url=None, template_body=None, params=None
+):
+    """Create the stack if it does not exist; otherwise update it.
+
+    Returns a "promise" dict with keys ``StackName`` and ``Action``
+    (``create``, ``update`` or ``noop``) to be passed to ``wait()``.
+    """
+    kwargs = {
+        "StackName": stack_name,
+        "Capabilities": [
+            "CAPABILITY_IAM",
+            "CAPABILITY_NAMED_IAM",
+            "CAPABILITY_AUTO_EXPAND",
+        ],
+    }
+    if template_url:
+        kwargs["TemplateURL"] = template_url
+    if template_body:
+        kwargs["TemplateBody"] = template_body
+    if params:
+        kwargs["Parameters"] = params
+
+    client = boto3.client("cloudformation", region_name=args.aws_region)
+
+    if stack_exists(args, stack_name=stack_name):
+        print(f"Stack {stack_name} already exists. Updating...")
+        try:
+            client.update_stack(**kwargs)
+            return {"StackName": stack_name, "Action": "update"}
+        except botocore.exceptions.ClientError as e:
+            # An unchanged stack surfaces as a ValidationError; treat it as a no-op.
+            if e.response["Error"]["Code"] == "ValidationError" and re.search(
+                "No updates are to be performed", e.response["Error"]["Message"]
+            ):
+                print(f"No update was made to {stack_name}")
+                return {"StackName": stack_name, "Action": "noop"}
+            else:
+                raise
+    else:
+        kwargs.update({"OnFailure": "ROLLBACK", "EnableTerminationProtection": False})
+        client.create_stack(**kwargs)
+        return {"StackName": stack_name, "Action": "create"}
+
+
+def wait(args, promise):
+    """Block until the stack operation described by ``promise`` completes.
+
+    ``args`` is passed in explicitly (for ``args.aws_region``) instead of
+    relying on an ``args`` name that is not defined in this function's scope.
+    """
+    client = boto3.client("cloudformation", region_name=args.aws_region)
+    stack_name = promise["StackName"]
+    print(f"Waiting for {stack_name}...")
+    if promise["Action"] == "create":
+        waiter = client.get_waiter("stack_create_complete")
+        waiter.wait(StackName=stack_name)
+        print(f"Finished creating stack {stack_name}")
+    elif promise["Action"] == "update":
+        waiter = client.get_waiter("stack_update_complete")
+        waiter.wait(StackName=stack_name)
+        print(f"Finished updating stack {stack_name}")
+    elif promise["Action"] != "noop":
+        raise ValueError(f"Invalid promise {promise}")
+
+
+def create_agent_iam_policy(args):
+    """Deploy the agent IAM policy stack and return its physical resource ID."""
+    policy_stack_name = "buildkite-agent-iam-policy"
+    print(f"Creating stack {policy_stack_name} for agent IAM policy...")
+    with open(
+        os.path.join(current_dir, "agent-iam-policy-template.yml"),
+        encoding="utf-8",
+    ) as f:
+        policy_template = f.read()
+    promise = create_or_update_stack(
+        args, stack_name=policy_stack_name, template_body=policy_template
+    )
+    wait(args, promise)
+
+    cf = boto3.resource("cloudformation", region_name=args.aws_region)
+    policy = cf.StackResource(policy_stack_name, "BuildkiteAgentManagedPolicy")
+    return policy.physical_resource_id
+
+
 def main(args):
+    agent_iam_policy = create_agent_iam_policy(args)
+
     client = boto3.client("cloudformation", region_name=args.aws_region)
 
+    promises = []
+
     for stack_id in AMI_ID:
         stack_id_full = get_full_stack_id(stack_id)
         print(f"Creating elastic CI stack {stack_id_full}...")
 
-        params = format_params(args, stack_id=stack_id)
-
-        response = client.create_stack(
-            StackName=stack_id_full,
-            TemplateURL=TEMPLATE_URL,
-            Capabilities=[
-                "CAPABILITY_IAM",
-                "CAPABILITY_NAMED_IAM",
-                "CAPABILITY_AUTO_EXPAND",
-            ],
-            OnFailure="ROLLBACK",
-            EnableTerminationProtection=False,
-            Parameters=params,
+        params = format_params(
+            args, stack_id=stack_id, agent_iam_policy=agent_iam_policy
         )
+
+        promise = create_or_update_stack(
+            args, stack_name=stack_id_full, template_url=TEMPLATE_URL, params=params
+        )
+        promises.append(promise)
         print(f"CI stack {stack_id_full} is in progress in the background")
 
-    for stack_id in AMI_ID:
-        stack_id_full = get_full_stack_id(stack_id)
-        waiter = client.get_waiter("stack_create_complete")
-        waiter.wait(StackName=stack_id_full)
-        print(f"CI stack {stack_id_full} is now finished.")
+    for promise in promises:
+        wait(args, promise)
 
 
 if __name__ == "__main__":
diff --git a/tests/buildkite/infrastructure/aws-stack-creator/metadata.py b/tests/buildkite/infrastructure/aws-stack-creator/metadata.py
index 6e94c5108a00..edb4cc036f9c 100644
--- a/tests/buildkite/infrastructure/aws-stack-creator/metadata.py
+++ b/tests/buildkite/infrastructure/aws-stack-creator/metadata.py
@@ -13,6 +13,7 @@
         "us-west-2": "ami-0a1a2ea551a07ad5f",
     },
     # Managed by BuildKite
+    # from https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml
     "linux-amd64-cpu": {
         "us-west-2": "ami-075d4c25d5f0c17c1",
     },
@@ -105,7 +106,6 @@
     "EnableCostAllocationTags": "true",
     "CostAllocationTagName": "CreatedBy",
     "ECRAccessPolicy": "full",
-    "ManagedPolicyARN": "arn:aws:iam::aws:policy/AmazonS3FullAccess,arn:aws:iam::aws:policy/service-role/AWSLambdaRole",
     "EnableSecretsPlugin": "false",
     "EnableECRPlugin": "false",
     "EnableDockerLoginPlugin": "false",
diff --git a/tests/buildkite/pipeline-mgpu.yml b/tests/buildkite/pipeline-mgpu.yml
index fa8f4988b9c5..690027da5009 100644
--- a/tests/buildkite/pipeline-mgpu.yml
+++ b/tests/buildkite/pipeline-mgpu.yml
@@ -13,6 +13,15 @@ steps:
   - wait
   - block: ":rocket: Run this test job"
     if: build.pull_request.repository.fork == true
+  #### -------- CONTAINER BUILD --------
+  - label: ":docker: Build containers"
+    commands:
+      - "tests/buildkite/build-containers.sh gpu"
+      - "tests/buildkite/build-containers.sh jvm_gpu_build"
+    key: build-containers
+    agents:
+      queue: linux-amd64-cpu
+  - wait
   #### -------- BUILD --------
   - label: ":console: Build CUDA"
     command: "tests/buildkite/build-cuda.sh"
@@ -24,9 +33,7 @@ steps:
     key: build-jvm-packages-gpu
     agents:
       queue: linux-amd64-mgpu
-
   - wait
-
   #### -------- TEST --------
   - label: ":console: Test Python package, 4 GPUs"
     command: "tests/buildkite/test-python-gpu.sh mgpu"
diff --git a/tests/buildkite/pipeline.yml b/tests/buildkite/pipeline.yml
index 0062fc800ff5..8d6ab86d95f6 100644
--- a/tests/buildkite/pipeline.yml
+++ b/tests/buildkite/pipeline.yml
@@ -10,6 +10,15 @@ steps:
   - wait
   - block: ":rocket: Run this test job"
     if: build.pull_request.repository.fork == true
+  #### -------- CONTAINER BUILD --------
+  - label: ":docker: Build containers"
+    commands:
+      - "tests/buildkite/build-containers.sh gpu"
+      - "tests/buildkite/build-containers.sh rmm"
+    key: build-containers
+    agents:
+      queue: linux-amd64-cpu
+  - wait
   #### -------- BUILD --------
   - label: ":console: Run clang-tidy"
     command: "tests/buildkite/run-clang-tidy.sh"
@@ -52,9 +61,7 @@ steps:
     key: build-jvm-doc
     agents:
       queue: linux-amd64-cpu
-
   - wait
-
   #### -------- TEST --------
   - label: ":console: Test Python package, CPU"
     command: "tests/buildkite/test-python-cpu.sh"
@@ -81,9 +88,8 @@ steps:
     key: test-integration-jvm-packages
     agents:
       queue: linux-amd64-cpu
-
   - wait
-
+  #### -------- DEPLOY JVM --------
   - label: ":console: Deploy JVM packages"
     command: "tests/buildkite/deploy-jvm-packages.sh"
     key: deploy-jvm-packages
diff --git a/tests/buildkite/test-python-gpu.sh b/tests/buildkite/test-python-gpu.sh
index f6e2096066ed..a575878d3c2b 100755
--- a/tests/buildkite/test-python-gpu.sh
+++ b/tests/buildkite/test-python-gpu.sh
@@ -29,11 +29,15 @@ command_wrapper="tests/ci_build/ci_build.sh gpu nvidia-docker --build-arg "`
 # Run specified test suite
 case "$suite" in
   gpu)
+    export BUILDKITE_ANALYTICS_TOKEN=$(get_aws_secret buildkite/test_analytics/gpu)
+    set_buildkite_env_vars_in_container
     echo "--- Test XGBoost Python package, single GPU"
     $command_wrapper tests/ci_build/test_python.sh $suite
     ;;
 
   mgpu)
+    export BUILDKITE_ANALYTICS_TOKEN=$(get_aws_secret buildkite/test_analytics/mgpu)
+    set_buildkite_env_vars_in_container
     echo "--- Test XGBoost Python package, 4 GPUs"
     $command_wrapper tests/ci_build/test_python.sh $suite
     ;;
diff --git a/tests/ci_build/Dockerfile.gpu b/tests/ci_build/Dockerfile.gpu
index d0d684eedd76..5c7d19ee6ce7 100644
--- a/tests/ci_build/Dockerfile.gpu
+++ b/tests/ci_build/Dockerfile.gpu
@@ -25,7 +25,9 @@ RUN \
     python=3.9 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG \
     dask dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \
     numpy pytest scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \
-    pyspark cloudpickle cuda-python=11.7.0 && mamba clean --all
+    pyspark cloudpickle cuda-python=11.7.0 && \
+    mamba clean --all && \
+    conda run --no-capture-output -n gpu_test pip install buildkite-test-collector
 
 ENV GOSU_VERSION 1.10
 ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/
diff --git a/tests/ci_build/Dockerfile.rmm b/tests/ci_build/Dockerfile.rmm
index b0eefdaac408..0fbe44865456 100644
--- a/tests/ci_build/Dockerfile.rmm
+++ b/tests/ci_build/Dockerfile.rmm
@@ -27,8 +27,10 @@ ENV PATH=/opt/python/bin:$PATH
 
 # Create new Conda environment with RMM
 RUN \
-    conda create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
-    python=3.9 rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG cmake
+    conda install -c conda-forge mamba && \
+    mamba create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
+    python=3.9 rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG cmake && \
+    mamba clean --all
 
 ENV GOSU_VERSION 1.10
 
diff --git a/tests/ci_build/test_python.sh b/tests/ci_build/test_python.sh
index fcf5cdc3cb3c..d4ec30f4114e 100755
--- a/tests/ci_build/test_python.sh
+++ b/tests/ci_build/test_python.sh
@@ -1,6 +1,5 @@
 #!/bin/bash
 set -e
-set -x
 
 if [ "$#" -lt 1 ]
 then
@@ -54,39 +53,47 @@ function uninstall_xgboost {
 case "$suite" in
   gpu)
     source activate gpu_test
+    set -x
     install_xgboost
     setup_pyspark_envs
     pytest -v -s -rxXs --fulltrace --durations=0 -m "not mgpu" ${args} tests/python-gpu
     unset_pyspark_envs
     uninstall_xgboost
+    set +x
     ;;
 
   mgpu)
     source activate gpu_test
+    set -x
     install_xgboost
     setup_pyspark_envs
     pytest -v -s -rxXs --fulltrace --durations=0 -m "mgpu" ${args} tests/python-gpu
     unset_pyspark_envs
     uninstall_xgboost
+    set +x
     ;;
 
   cpu)
     source activate cpu_test
+    set -x
     install_xgboost
     export RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE=1
     setup_pyspark_envs
     pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/python
     unset_pyspark_envs
     uninstall_xgboost
+    set +x
     ;;
 
   cpu-arm64)
     source activate aarch64_test
+    set -x
     install_xgboost
     setup_pyspark_envs
     pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/python/test_basic.py tests/python/test_basic_models.py tests/python/test_model_compatibility.py
     unset_pyspark_envs
     uninstall_xgboost
+    set +x
     ;;
 
   *)