From 50ff8a26238b08d44862971f31b7beae1d57426b Mon Sep 17 00:00:00 2001
From: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
Date: Thu, 6 Oct 2022 06:33:46 -0800
Subject: [PATCH] More CI improvements (#8313)

* Reduce clutter in log of Python test

* Set up BuildKite test analytics

* Add separate step for building containers

* Enable incremental update of CI stack; custom agent IAM policy
---
 tests/buildkite/build-containers.sh           |  36 +++++
 tests/buildkite/conftest.sh                   |  19 +++
 .../agent-iam-policy-template.yml             |  32 +++++
 .../aws-stack-creator/create_stack.py         | 123 +++++++++++++++---
 .../aws-stack-creator/metadata.py             |   2 +-
 tests/buildkite/pipeline-mgpu.yml             |  11 +-
 tests/buildkite/pipeline.yml                  |  14 +-
 tests/buildkite/test-python-gpu.sh            |   4 +
 tests/ci_build/Dockerfile.gpu                 |   4 +-
 tests/ci_build/Dockerfile.rmm                 |   6 +-
 tests/ci_build/test_python.sh                 |   9 +-
 11 files changed, 229 insertions(+), 31 deletions(-)
 create mode 100755 tests/buildkite/build-containers.sh
 create mode 100644 tests/buildkite/infrastructure/aws-stack-creator/agent-iam-policy-template.yml

diff --git a/tests/buildkite/build-containers.sh b/tests/buildkite/build-containers.sh
new file mode 100755
index 000000000000..b12da2a634ed
--- /dev/null
+++ b/tests/buildkite/build-containers.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+#
+# Build a single CI container image through tests/ci_build/ci_build.sh.
+# Usage: tests/buildkite/build-containers.sh [container to build]
+# Recognized containers: gpu, rmm, jvm_gpu_build
+
+set -euo pipefail
+set -x
+
+if [ "$#" -lt 1 ]
+then
+  echo "Usage: $0 [container to build]"
+  exit 1
+fi
+container=$1
+
+# Expected to define CUDA_VERSION and RAPIDS_VERSION, which are read below
+source tests/buildkite/conftest.sh
+
+echo "--- Build container ${container}"
+
+BUILD_ARGS=""
+
+case "${container}" in
+  gpu|rmm)
+    BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION"
+    BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION"
+    ;;
+
+  jvm_gpu_build)
+    BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION"
+    ;;
+
+  *)
+    echo "Unrecognized container ID: ${container}"
+    exit 2
+    ;;
+esac
+
+# Run a no-op command. This will simply build the container and push it to the private registry
+# BUILD_ARGS is deliberately left unquoted so that it splits into separate arguments.
+tests/ci_build/ci_build.sh "${container}" docker ${BUILD_ARGS} bash
diff --git a/tests/buildkite/conftest.sh b/tests/buildkite/conftest.sh
index dea3f75ac473..30ef4aeab3bc 100755
--- a/tests/buildkite/conftest.sh
+++ b/tests/buildkite/conftest.sh
@@ -1,6 +1,28 @@
 #!/bin/bash
 
 set -euo pipefail
+
+# Print the SecretString of the named AWS Secrets Manager secret (region us-west-2).
+function get_aws_secret {
+  if [[ $# -ne 1 ]]
+  then
+    # Report on stderr: stdout is what callers capture as the secret value
+    echo "Usage: get_aws_secret [Name of secret]" >&2
+    return 1
+  fi
+  aws secretsmanager get-secret-value --secret-id "$1" --output text --region us-west-2 --query SecretString
+}
+
+function set_buildkite_env_vars_in_container {
+  # Pass all Buildkite-specific env vars to Docker containers.
+  # This is to be used with tests/ci_build/ci_build.sh
+  # The backtick pairs are empty command substitutions, used only to continue the string across lines.
+  export CI_DOCKER_EXTRA_PARAMS_INIT="${CI_DOCKER_EXTRA_PARAMS_INIT:-} "`
+    `"--env BUILDKITE_ANALYTICS_TOKEN --env BUILDKITE_BUILD_ID --env BUILDKITE_BUILD_NUMBER "`
+    `"--env BUILDKITE_JOB_ID --env BUILDKITE_BRANCH --env BUILDKITE_COMMIT "`
+    `"--env BUILDKITE_MESSAGE --env BUILDKITE_BUILD_URL"
+}
+
 set -x
 
 CUDA_VERSION=11.0.3
diff --git a/tests/buildkite/infrastructure/aws-stack-creator/agent-iam-policy-template.yml b/tests/buildkite/infrastructure/aws-stack-creator/agent-iam-policy-template.yml
new file mode 100644
index 000000000000..7f15b1fbcd4f
--- /dev/null
+++ b/tests/buildkite/infrastructure/aws-stack-creator/agent-iam-policy-template.yml
@@ -0,0 +1,32 @@
+---
+AWSTemplateFormatVersion: "2010-09-09"
+Description: "Buildkite agent's IAM policy"
+
+Resources:
+  BuildkiteAgentManagedPolicy:  # logical ID; create_stack.py looks the policy up by this name
+    Type: AWS::IAM::ManagedPolicy
+    Properties:
+      PolicyDocument:  # JSON (valid YAML flow syntax): S3, Lambda invoke and secret reads on all resources
+        {
+          "Version": "2012-10-17",
+          "Statement": [
+            {
+              "Effect": "Allow",
+              "Action": [
+                "s3:*",
+                "s3-object-lambda:*"
+              ],
+              "Resource": "*"
+            },
+            {
+              "Effect": "Allow",
+              "Action": "lambda:InvokeFunction",
+              "Resource": "*"
+            },
+            {
+              "Effect": "Allow",
+              "Action": "secretsmanager:GetSecretValue",
+              "Resource": "*"
+            }
+          ]
+        }
diff --git a/tests/buildkite/infrastructure/aws-stack-creator/create_stack.py b/tests/buildkite/infrastructure/aws-stack-creator/create_stack.py
index f4e92f634505..b9409de4cdcf 100644
--- a/tests/buildkite/infrastructure/aws-stack-creator/create_stack.py
+++ b/tests/buildkite/infrastructure/aws-stack-creator/create_stack.py
@@ -1,10 +1,14 @@
 import argparse
 import copy
+import os
+import re
 
 import boto3
-
+import botocore
 from metadata import AMI_ID, COMMON_STACK_PARAMS, STACK_PARAMS
 
+current_dir = os.path.dirname(__file__)
+
 TEMPLATE_URL = "https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml"
 
 
@@ -33,7 +37,7 @@ def get_default_vpc(*, aws_region):
     return ec2.Vpc(default_vpc_id)
 
 
-def format_params(args, *, stack_id):
+def format_params(args, *, stack_id, agent_iam_policy):
     default_vpc = get_default_vpc(aws_region=args.aws_region)
     azs = get_availability_zones(aws_region=args.aws_region)
     # For each of the first two availability zones (AZs), choose the default subnet
@@ -55,6 +59,7 @@ def format_params(args, *, stack_id):
     params["BuildkiteAgentToken"] = args.agent_token
     params["VpcId"] = default_vpc.id
     params["Subnets"] = ",".join(subnets)
+    params["ManagedPolicyARN"] = agent_iam_policy
     params.update(COMMON_STACK_PARAMS)
     return [{"ParameterKey": k, "ParameterValue": v} for k, v in params.items()]
 
@@ -63,34 +68,112 @@ def get_full_stack_id(stack_id):
     return f"buildkite-{stack_id}-autoscaling-group"
 
 
+def stack_exists(args, *, stack_name):  # True if the stack exists in args.aws_region
+    client = boto3.client("cloudformation", region_name=args.aws_region)
+    waiter = client.get_waiter("stack_exists")
+    try:
+        waiter.wait(StackName=stack_name, WaiterConfig={"MaxAttempts": 1})  # one attempt
+        return True
+    except botocore.exceptions.WaiterError as e:  # any waiter failure counts as "missing"
+        return False
+
+
+def create_or_update_stack(
+    args, *, stack_name, template_url=None, template_body=None, params=None
+):
+    """Start creating the stack `stack_name`, or updating it if it already exists.
+
+    Returns a "promise" dict with keys "StackName" and "Action", where Action is
+    "create", "update" or "noop" (CloudFormation reported nothing to update).
+    """
+    client = boto3.client("cloudformation", region_name=args.aws_region)
+    capabilities = ["CAPABILITY_IAM", "CAPABILITY_NAMED_IAM", "CAPABILITY_AUTO_EXPAND"]
+    kwargs = {"StackName": stack_name, "Capabilities": capabilities}
+    optional_kwargs = {
+        "TemplateURL": template_url,
+        "TemplateBody": template_body,
+        "Parameters": params,
+    }
+    kwargs.update({key: value for key, value in optional_kwargs.items() if value})
+
+    if not stack_exists(args, stack_name=stack_name):
+        client.create_stack(
+            **kwargs, OnFailure="ROLLBACK", EnableTerminationProtection=False
+        )
+        return {"StackName": stack_name, "Action": "create"}
+
+    print(f"Stack {stack_name} already exists. Updating...")
+    try:
+        client.update_stack(**kwargs)
+    except botocore.exceptions.ClientError as e:
+        error = e.response["Error"]
+        nothing_to_update = error["Code"] == "ValidationError" and re.search(
+            "No updates are to be performed", error["Message"]
+        )
+        if not nothing_to_update:
+            raise
+        print(f"No update was made to {stack_name}")
+        return {"StackName": stack_name, "Action": "noop"}
+    return {"StackName": stack_name, "Action": "update"}
+
+
+def wait(promise, *, client=None):
+    """Block until the stack operation described by `promise` has finished.
+
+    `promise` is a dict as returned by create_or_update_stack(). Pass the
+    CloudFormation `client` explicitly; if omitted, one is built from a
+    module-level `args` (the previous behaviour, kept for existing callers).
+    """
+    if client is None:
+        client = boto3.client("cloudformation", region_name=args.aws_region)
+    stack_name = promise["StackName"]
+    print(f"Waiting for {stack_name}...")
+    action = promise["Action"]
+    if action in ("create", "update"):
+        client.get_waiter(f"stack_{action}_complete").wait(StackName=stack_name)
+        verb = "creating" if action == "create" else "updating"
+        print(f"Finished {verb} stack {stack_name}")
+    elif action != "noop":
+        raise ValueError(f"Invalid promise {promise}")
+
+
+def create_agent_iam_policy(args):
+    """Deploy the agent IAM policy stack; return the policy's physical resource ID."""
+    policy_stack_name = "buildkite-agent-iam-policy"
+    print(f"Creating stack {policy_stack_name} for agent IAM policy...")
+    template_path = os.path.join(current_dir, "agent-iam-policy-template.yml")
+    with open(template_path, encoding="utf-8") as f:
+        policy_template = f.read()
+    promise = create_or_update_stack(
+        args, stack_name=policy_stack_name, template_body=policy_template
+    )
+    cf = boto3.resource("cloudformation", region_name=args.aws_region)
+    wait(promise, client=cf.meta.client)
+    policy = cf.StackResource(policy_stack_name, "BuildkiteAgentManagedPolicy")
+    return policy.physical_resource_id
+
+
 def main(args):
+    """Deploy the agent IAM policy, then create or update every CI stack in AMI_ID."""
+    agent_iam_policy = create_agent_iam_policy(args)
     client = boto3.client("cloudformation", region_name=args.aws_region)
 
+    promises = []
     for stack_id in AMI_ID:
         stack_id_full = get_full_stack_id(stack_id)
         print(f"Creating elastic CI stack {stack_id_full}...")
 
-        params = format_params(args, stack_id=stack_id)
-
-        response = client.create_stack(
-            StackName=stack_id_full,
-            TemplateURL=TEMPLATE_URL,
-            Capabilities=[
-                "CAPABILITY_IAM",
-                "CAPABILITY_NAMED_IAM",
-                "CAPABILITY_AUTO_EXPAND",
-            ],
-            OnFailure="ROLLBACK",
-            EnableTerminationProtection=False,
-            Parameters=params,
+        params = format_params(
+            args, stack_id=stack_id, agent_iam_policy=agent_iam_policy
         )
+        promise = create_or_update_stack(
+            args, stack_name=stack_id_full, template_url=TEMPLATE_URL, params=params
+        )
+        promises.append(promise)
         print(f"CI stack {stack_id_full} is in progress in the background")
 
-    for stack_id in AMI_ID:
-        stack_id_full = get_full_stack_id(stack_id)
-        waiter = client.get_waiter("stack_create_complete")
-        waiter.wait(StackName=stack_id_full)
-        print(f"CI stack {stack_id_full} is now finished.")
+    for promise in promises:
+        wait(promise, client=client)
 
 
 if __name__ == "__main__":
diff --git a/tests/buildkite/infrastructure/aws-stack-creator/metadata.py b/tests/buildkite/infrastructure/aws-stack-creator/metadata.py
index 6e94c5108a00..edb4cc036f9c 100644
--- a/tests/buildkite/infrastructure/aws-stack-creator/metadata.py
+++ b/tests/buildkite/infrastructure/aws-stack-creator/metadata.py
@@ -13,6 +13,7 @@
         "us-west-2": "ami-0a1a2ea551a07ad5f",
     },
     # Managed by BuildKite
+    # from https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml
     "linux-amd64-cpu": {
         "us-west-2": "ami-075d4c25d5f0c17c1",
     },
@@ -105,7 +106,6 @@
     "EnableCostAllocationTags": "true",
     "CostAllocationTagName": "CreatedBy",
     "ECRAccessPolicy": "full",
-    "ManagedPolicyARN": "arn:aws:iam::aws:policy/AmazonS3FullAccess,arn:aws:iam::aws:policy/service-role/AWSLambdaRole",
     "EnableSecretsPlugin": "false",
     "EnableECRPlugin": "false",
     "EnableDockerLoginPlugin": "false",
diff --git a/tests/buildkite/pipeline-mgpu.yml b/tests/buildkite/pipeline-mgpu.yml
index fa8f4988b9c5..690027da5009 100644
--- a/tests/buildkite/pipeline-mgpu.yml
+++ b/tests/buildkite/pipeline-mgpu.yml
@@ -13,6 +13,15 @@ steps:
   - wait
   - block: ":rocket: Run this test job"
     if: build.pull_request.repository.fork == true
+  #### -------- CONTAINER BUILD --------
+  - label: ":docker: Build containers"
+    commands:
+      - "tests/buildkite/build-containers.sh gpu"
+      - "tests/buildkite/build-containers.sh jvm_gpu_build"
+    key: build-containers
+    agents:
+      queue: linux-amd64-cpu
+  - wait
   #### -------- BUILD --------
   - label: ":console: Build CUDA"
     command: "tests/buildkite/build-cuda.sh"
@@ -24,9 +33,7 @@ steps:
     key: build-jvm-packages-gpu
     agents:
       queue: linux-amd64-mgpu
-
   - wait
-
   #### -------- TEST --------
   - label: ":console: Test Python package, 4 GPUs"
     command: "tests/buildkite/test-python-gpu.sh mgpu"
diff --git a/tests/buildkite/pipeline.yml b/tests/buildkite/pipeline.yml
index 0062fc800ff5..8d6ab86d95f6 100644
--- a/tests/buildkite/pipeline.yml
+++ b/tests/buildkite/pipeline.yml
@@ -10,6 +10,15 @@ steps:
   - wait
   - block: ":rocket: Run this test job"
     if: build.pull_request.repository.fork == true
+  #### -------- CONTAINER BUILD --------
+  - label: ":docker: Build containers"
+    commands:
+      - "tests/buildkite/build-containers.sh gpu"
+      - "tests/buildkite/build-containers.sh rmm"
+    key: build-containers
+    agents:
+      queue: linux-amd64-cpu
+  - wait
   #### -------- BUILD --------
   - label: ":console: Run clang-tidy"
     command: "tests/buildkite/run-clang-tidy.sh"
@@ -52,9 +61,7 @@ steps:
     key: build-jvm-doc
     agents:
       queue: linux-amd64-cpu
-
   - wait
-
   #### -------- TEST --------
   - label: ":console: Test Python package, CPU"
     command: "tests/buildkite/test-python-cpu.sh"
@@ -81,9 +88,8 @@ steps:
     key: test-integration-jvm-packages
     agents:
       queue: linux-amd64-cpu
-
   - wait
-
+  #### -------- DEPLOY JVM --------
   - label: ":console: Deploy JVM packages"
     command: "tests/buildkite/deploy-jvm-packages.sh"
     key: deploy-jvm-packages
diff --git a/tests/buildkite/test-python-gpu.sh b/tests/buildkite/test-python-gpu.sh
index f6e2096066ed..a575878d3c2b 100755
--- a/tests/buildkite/test-python-gpu.sh
+++ b/tests/buildkite/test-python-gpu.sh
@@ -29,11 +29,21 @@ command_wrapper="tests/ci_build/ci_build.sh gpu nvidia-docker --build-arg "`
 # Run specified test suite
 case "$suite" in
   gpu)
+    # conftest.sh enables xtrace (set -x); pause it so the token is not echoed into the build log
+    set +x
+    export BUILDKITE_ANALYTICS_TOKEN=$(get_aws_secret buildkite/test_analytics/gpu)
+    set -x
+    set_buildkite_env_vars_in_container
     echo "--- Test XGBoost Python package, single GPU"
     $command_wrapper tests/ci_build/test_python.sh $suite
     ;;
 
   mgpu)
+    # conftest.sh enables xtrace (set -x); pause it so the token is not echoed into the build log
+    set +x
+    export BUILDKITE_ANALYTICS_TOKEN=$(get_aws_secret buildkite/test_analytics/mgpu)
+    set -x
+    set_buildkite_env_vars_in_container
     echo "--- Test XGBoost Python package, 4 GPUs"
     $command_wrapper tests/ci_build/test_python.sh $suite
     ;;
diff --git a/tests/ci_build/Dockerfile.gpu b/tests/ci_build/Dockerfile.gpu
index d0d684eedd76..5c7d19ee6ce7 100644
--- a/tests/ci_build/Dockerfile.gpu
+++ b/tests/ci_build/Dockerfile.gpu
@@ -25,7 +25,9 @@ RUN \
         python=3.9 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG \
         dask dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \
         numpy pytest scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \
-        pyspark cloudpickle cuda-python=11.7.0 && mamba clean --all
+        pyspark cloudpickle cuda-python=11.7.0 && \
+    mamba clean --all && \
+    conda run --no-capture-output -n gpu_test pip install buildkite-test-collector
 
 ENV GOSU_VERSION 1.10
 ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/
diff --git a/tests/ci_build/Dockerfile.rmm b/tests/ci_build/Dockerfile.rmm
index b0eefdaac408..0fbe44865456 100644
--- a/tests/ci_build/Dockerfile.rmm
+++ b/tests/ci_build/Dockerfile.rmm
@@ -27,8 +27,10 @@ ENV PATH=/opt/python/bin:$PATH
 
 # Create new Conda environment with RMM
 RUN \
-    conda create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
-        python=3.9 rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG cmake
+    conda install -c conda-forge mamba && \
+    mamba create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
+        python=3.9 rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG cmake && \
+    mamba clean --all
 
 ENV GOSU_VERSION 1.10
 
diff --git a/tests/ci_build/test_python.sh b/tests/ci_build/test_python.sh
index fcf5cdc3cb3c..d4ec30f4114e 100755
--- a/tests/ci_build/test_python.sh
+++ b/tests/ci_build/test_python.sh
@@ -1,6 +1,5 @@
 #!/bin/bash
 set -e
-set -x
 
 if [ "$#" -lt 1 ]
 then
@@ -54,39 +53,47 @@ function uninstall_xgboost {
 case "$suite" in
   gpu)
     source activate gpu_test
+    set -x
     install_xgboost
     setup_pyspark_envs
     pytest -v -s -rxXs --fulltrace --durations=0 -m "not mgpu" ${args} tests/python-gpu
     unset_pyspark_envs
     uninstall_xgboost
+    set +x
     ;;
 
   mgpu)
     source activate gpu_test
+    set -x
     install_xgboost
     setup_pyspark_envs
     pytest -v -s -rxXs --fulltrace --durations=0 -m "mgpu" ${args} tests/python-gpu
     unset_pyspark_envs
     uninstall_xgboost
+    set +x
     ;;
 
   cpu)
     source activate cpu_test
+    set -x
     install_xgboost
     export RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE=1
     setup_pyspark_envs
     pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/python
     unset_pyspark_envs
     uninstall_xgboost
+    set +x
     ;;
 
   cpu-arm64)
     source activate aarch64_test
+    set -x
     install_xgboost
     setup_pyspark_envs
     pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/python/test_basic.py tests/python/test_basic_models.py tests/python/test_model_compatibility.py
     unset_pyspark_envs
     uninstall_xgboost
+    set +x
     ;;
 
   *)