From d862239e5552888b174f7aade37a0759fae03294 Mon Sep 17 00:00:00 2001
From: hakuna-matatah
Date: Wed, 15 Nov 2023 22:23:23 -0800
Subject: [PATCH] Measure all SLOs that we run on KOPS AWS

---
 Review notes (commentary placed after the three-dash line; it is not part
 of the commit message and not part of either diff):

 * File 1 adds the Tekton Pipeline awscli-eks-cl2loadtest-with-addons-slos
   in namespace scalability. Ordering is expressed only through runAfter:
     slack-notification
       -> create-cluster-service-role, create-cluster-node-role
     awscli-vpc-create (no runAfter)
     create-eks-cluster (after both roles and the VPC, retries: 3)
       -> create-launch-template -> create-mng-monitoring-nodes
       -> install-fluentbit-addon -> create-mng-nodes
       -> create-cw-agent-addon -> generate -> cw-metrics
 * generate references the load-slos Task added by file 2; cw-metrics
   forwards $(tasks.generate.results.datapoint) to the cloudwatch Task.
 * teardown is declared under finally with retries: 10.
 * The YAML was re-indented and keys inside each mapping were reordered
   (name first). Values are unchanged and the file still has 264 lines,
   matching the hunk header.

 .../eks/awscli-cl2-load-with-addons-slos.yaml | 264 ++++++++++++++++++
 .../generators/clusterloader/load-slos.yaml   | 173 ++++++++++++
 2 files changed, 437 insertions(+)
 create mode 100644 tests/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml
 create mode 100644 tests/tasks/generators/clusterloader/load-slos.yaml

diff --git a/tests/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml b/tests/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml
new file mode 100644
index 00000000..00dd831b
--- /dev/null
+++ b/tests/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml
@@ -0,0 +1,264 @@
+apiVersion: tekton.dev/v1
+kind: Pipeline
+metadata:
+  name: awscli-eks-cl2loadtest-with-addons-slos
+  namespace: scalability
+spec:
+  params:
+    - name: cluster-name
+      type: string
+    - name: endpoint
+      type: string
+    - name: desired-nodes
+      type: string
+    - name: pods-per-node
+      type: string
+    - name: nodes-per-namespace
+      type: string
+    - name: cl2-load-test-throughput
+      type: string
+    - name: results-bucket
+      type: string
+    - name: slack-hook
+      type: string
+      default: ""
+    - name: slack-message
+      type: string
+    - name: amp-workspace-id
+      type: string
+    - name: vpc-cfn-url
+      type: string
+      default: "https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/amazon-eks-vpc.json"
+    - name: ng-cfn-url
+      type: string
+      default: "https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/eks_nodeGroup_LaunchTemplate.yaml"
+    - name: kubernetes-version
+      type: string
+    - name: service-role-cfn-url
+      type: string
+      default: https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/eks_service_role.json
+    - name: node-role-cfn-url
+      type: string
+      default: https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/eks_node_role.json
+  workspaces:
+    - name: source
+    - name: results
+    - name: config
+  tasks:
+    - name: slack-notification
+      taskRef:
+        kind: Task
+        name: slack-notification
+      params:
+        - name: slack-hook
+          value: $(params.slack-hook)
+        - name: slack-message
+          value: $(params.slack-message) job kicked off
+    - name: create-cluster-service-role
+      taskRef:
+        kind: Task
+        name: awscli-role-create
+      runAfter:
+        - slack-notification
+      params:
+        - name: stack-name
+          value: $(params.cluster-name)-service-role
+        - name: role-cfn-url
+          value: $(params.service-role-cfn-url)
+        - name: role-name
+          value: $(params.cluster-name)-service-role
+    - name: awscli-vpc-create
+      taskRef:
+        kind: Task
+        name: awscli-vpc-create
+      params:
+        - name: stack-name
+          value: $(params.cluster-name)
+        - name: vpc-cfn-url
+          value: $(params.vpc-cfn-url)
+    - name: create-cluster-node-role
+      taskRef:
+        kind: Task
+        name: awscli-role-create
+      runAfter:
+        - slack-notification
+      params:
+        - name: stack-name
+          value: $(params.cluster-name)-node-role
+        - name: role-cfn-url
+          value: $(params.node-role-cfn-url)
+        - name: role-name
+          value: $(params.cluster-name)-node-role
+    - name: create-eks-cluster
+      taskRef:
+        kind: Task
+        name: awscli-eks-cluster-create-with-vpc-stack
+      runAfter:
+        - create-cluster-node-role
+        - create-cluster-service-role
+        - awscli-vpc-create
+      retries: 3
+      params:
+        - name: cluster-name
+          value: $(params.cluster-name)
+        - name: service-role-name
+          value: $(params.cluster-name)-service-role
+        - name: endpoint
+          value: $(params.endpoint)
+        - name: vpc-stack-name
+          value: $(params.cluster-name)
+        - name: kubernetes-version
+          value: $(params.kubernetes-version)
+      workspaces:
+        - name: config
+          workspace: config
+    - name: create-launch-template
+      taskRef:
+        kind: Task
+        name: awscli-eks-cfn-launch-template
+      runAfter:
+        - create-eks-cluster
+      params:
+        - name: cluster-name
+          value: $(params.cluster-name)
+        - name: stack-name
+          value: $(params.cluster-name)-launch-template
+        - name: kubernetes-version
+          value: "$(params.kubernetes-version)"
+        - name: ng-cfn-url
+          value: "$(params.ng-cfn-url)"
+        - name: endpoint
+          value: $(params.endpoint)
+      workspaces:
+        - name: config
+          workspace: config
+    - name: create-mng-monitoring-nodes
+      taskRef:
+        kind: Task
+        name: awscli-eks-nodegroup-create
+      runAfter:
+        - create-launch-template
+      params:
+        - name: cluster-name
+          value: $(params.cluster-name)
+        - name: host-cluster-node-role-name
+          value: $(params.cluster-name)-node-role
+        - name: endpoint
+          value: $(params.endpoint)
+        - name: desired-nodes
+          value: "1"
+        - name: max-nodes
+          value: "1"
+        - name: host-instance-types
+          value: m5.4xlarge
+        - name: host-taints
+          value: key=monitoring,value=true,effect=NO_SCHEDULE
+        - name: nodegroup-prefix
+          value: monitoring-
+      workspaces:
+        - name: config
+          workspace: config
+    - name: install-fluentbit-addon
+      taskRef:
+        kind: Task
+        name: eks-addon-fluentbit
+      runAfter:
+        - create-mng-monitoring-nodes
+      params:
+        - name: cluster-name
+          value: $(params.cluster-name)
+      workspaces:
+        - name: config
+          workspace: config
+    - name: create-mng-nodes
+      taskRef:
+        kind: Task
+        name: awscli-eks-nodegroup-create
+      runAfter:
+        - install-fluentbit-addon
+      params:
+        - name: cluster-name
+          value: $(params.cluster-name)
+        - name: desired-nodes
+          value: $(params.desired-nodes)
+        - name: host-cluster-node-role-name
+          value: $(params.cluster-name)-node-role
+        - name: endpoint
+          value: $(params.endpoint)
+      workspaces:
+        - name: config
+          workspace: config
+    - name: create-cw-agent-addon
+      taskRef:
+        kind: Task
+        name: eks-addon-cwagent
+      runAfter:
+        - create-mng-nodes
+      params:
+        - name: cluster-name
+          value: $(params.cluster-name)
+      workspaces:
+        - name: config
+          workspace: config
+    - name: generate
+      taskRef:
+        kind: Task
+        name: load-slos
+      runAfter:
+        - create-cw-agent-addon
+      params:
+        - name: cluster-name
+          value: $(params.cluster-name)
+        - name: pods-per-node
+          value: $(params.pods-per-node)
+        - name: nodes-per-namespace
+          value: $(params.nodes-per-namespace)
+        - name: cl2-load-test-throughput
+          value: $(params.cl2-load-test-throughput)
+        - name: results-bucket
+          value: $(params.results-bucket)
+        - name: nodes
+          value: $(params.desired-nodes)
+        - name: amp-workspace-id
+          value: $(params.amp-workspace-id)
+      workspaces:
+        - name: source
+          workspace: source
+        - name: results
+          workspace: results
+        - name: config
+          workspace: config
+    - name: cw-metrics
+      taskRef:
+        kind: Task
+        name: cloudwatch
+      runAfter:
+        - generate
+      params:
+        - name: dimensions
+          value: $(params.desired-nodes)
+        - name: value
+          value: $(tasks.generate.results.datapoint)
+        - name: namespace
+          value: $(params.kubernetes-version)
+  finally:
+    - name: teardown
+      taskRef:
+        kind: Task
+        name: awscli-eks-cluster-teardown
+      retries: 10
+      params:
+        - name: cluster-name
+          value: $(params.cluster-name)
+        - name: endpoint
+          value: $(params.endpoint)
+        - name: slack-hook
+          value: $(params.slack-hook)
+        - name: slack-message
+          value: $(params.slack-message) job completed
+        - name: service-role-stack-name
+          value: $(params.cluster-name)-service-role
+        - name: node-role-stack-name
+          value: $(params.cluster-name)-node-role
+        - name: launch-template-stack-name
+          value: $(params.cluster-name)-launch-template
\ No newline at end of file
diff --git a/tests/tasks/generators/clusterloader/load-slos.yaml b/tests/tasks/generators/clusterloader/load-slos.yaml
new file mode 100644
index 00000000..59fba116
--- /dev/null
+++ b/tests/tasks/generators/clusterloader/load-slos.yaml
@@ -0,0 +1,173 @@
+---
+apiVersion: tekton.dev/v1beta1
+kind: Task
+metadata:
+  name: load-slos
+  namespace: scalability
+spec:
+  description: "clusterloader2 task to run various types of cl2 tests on a given cluster."
+  # Task inputs. In the text visible here, nodes-per-namespace,
+  # cl2-load-test-throughput and pods-per-node are declared but never
+  # referenced by a step script; presumably they were consumed by the part
+  # of prepare-loadtest that is missing (see NOTE(review) below) - confirm.
+  params:
+    - name: giturl
+      description: "git url to clone the package"
+      default: https://github.com/hakuna-matatah/perf-tests.git
+    - name: cl2-branch
+      description: "The branch of clusterloader2 you want to use"
+      default: "master"
+    - name: nodes-per-namespace
+      description: "nodes per namespace to get created for load test "
+      default: "100"
+    - name: cl2-load-test-throughput
+      description: " throughput used for mutate operations"
+      default: "15"
+    - name: pods-per-node
+      description: "pod density"
+      default: "10"
+    - name: nodes
+      description: "number of dataplane nodes to run the load test against"
+      default: "1000"
+    - name: results-bucket
+      description: "Results bucket with path of s3 to upload results"
+    - name: region
+      description: The region where the cluster is in.
+      default: "us-west-2"
+    - name: cluster-name
+      description: The name of the EKS cluster you want to spin.
+    - name: amp-workspace-id
+      description: The AMP workspace ID where remote write needs to happen.
+      default: ""
+  results:
+    - name: datapoint
+      description: Stores the CL2 result that can be consumed by other tasks (e.g. cloudwatch)
+    - name: s3_result
+      description: Stores the S3 result path after compute
+  workspaces:
+    - name: source
+      mountPath: /src/k8s.io/
+    - name: results
+    - name: config
+      mountPath: /config/
+  # Every step gets KUBECONFIG pointing into the config workspace mount.
+  stepTemplate:
+    env:
+      - name: KUBECONFIG
+        value: /config/kubeconfig
+  steps:
+    # Clones giturl into the source workspace and checks out cl2-branch.
+    - name: git-clone
+      image: alpine/git
+      workingDir: $(workspaces.source.path)
+      script: |
+        git clone $(params.giturl)
+        cd $(workspaces.source.path)/perf-tests/
+        git fetch origin --verbose --tags
+        git checkout $(params.cl2-branch)
+        git branch
+    # Records the S3 destination in the s3_result result, appends fragments to
+    # two Prometheus manifests in the cloned tree, then builds ./clusterloader.
+    #
+    # NOTE(review): this script is damaged in this copy of the patch. The line
+    # starting `cat > ".../overrides.yaml" <>` looks like the remains of
+    # several lines: the redirection operator is cut short and the command
+    # runs straight into a YAML fragment closed by `EOF`, and the later `fi`
+    # has no visible matching `if`. The hunk header for this file declares 173
+    # added lines, but only about 136 are present. The line is reproduced
+    # verbatim; restore the missing text from the original commit.
+    #
+    # NOTE(review): the indentation of both `EOF`-terminated fragments was
+    # reconstructed and is an assumption. It decides where the appended keys
+    # land in the target manifests - verify against the original commit.
+    - name: prepare-loadtest
+      image: golang:1.19
+      workingDir: $(workspaces.source.path)
+      script: |
+        S3_RESULT_PATH=$(params.results-bucket)
+        echo $S3_RESULT_PATH > $(results.s3_result.path)
+        echo "S3 Path: $S3_RESULT_PATH"
+        cat > "$(workspaces.source.path)/overrides.yaml" <> $(workspaces.source.path)/perf-tests/clusterloader2/pkg/prometheus/manifests/prometheus-prometheus.yaml
+          containers:
+            - name: aws-sigv4-proxy-sidecar
+              image: public.ecr.aws/aws-observability/aws-sigv4-proxy:1.0
+              args:
+                - --name
+                - aps
+                - --region
+                - $(params.region)
+                - --host
+                - aps-workspaces.$(params.region).amazonaws.com
+                - --port
+                - :8005
+              ports:
+                - name: aws-sigv4-proxy
+                  containerPort: 8005
+          remoteWrite:
+            - url: http://localhost:8005/workspaces/$(params.amp-workspace-id)/api/v1/remote_write
+              queueConfig:
+                capacity: 2500
+                maxSamplesPerSend: 1000
+                maxShards: 200
+          externalLabels:
+            cluster_name: $(params.cluster-name)
+            s3_path: $S3_RESULT_PATH
+        EOF
+        cat $(workspaces.source.path)/perf-tests/clusterloader2/pkg/prometheus/manifests/prometheus-prometheus.yaml
+        cat << EOF >> $(workspaces.source.path)/perf-tests/clusterloader2/pkg/prometheus/manifests/0prometheus-operator-deployment.yaml
+              tolerations:
+                - key: monitoring
+                  operator: Exists
+                  effect: NoSchedule
+        EOF
+        cat $(workspaces.source.path)/perf-tests/clusterloader2/pkg/prometheus/manifests/0prometheus-operator-deployment.yaml
+        fi
+        # Building clusterloader2 binary
+        cd $(workspaces.source.path)/perf-tests/clusterloader2/
+        GOOS=linux CGO_ENABLED=0 go build -v -o ./clusterloader ./cmd
+    # Runs clusterloader against testing/load/config.yaml with the overrides
+    # file and writes "1" (exit code 0) or "0" (otherwise) to the datapoint
+    # result. The Prometheus flags are added only when amp-workspace-id is
+    # non-empty. The step exits with clusterloader's exit code and is marked
+    # onError: continue.
+    - name: run-loadtest
+      image: alpine/k8s:1.23.7
+      onError: continue
+      script: |
+        #!/bin/bash
+        if [ -n "$(params.amp-workspace-id)" ]; then
+          CL2_PROMETHEUS_FLAGS="--enable-prometheus-server=true --prometheus-pvc-storage-class gp2"
+        fi
+        cat $(workspaces.source.path)/perf-tests/clusterloader2/testing/load/config.yaml
+        cd $(workspaces.source.path)/perf-tests/clusterloader2/
+        ENABLE_EXEC_SERVICE=false ./clusterloader --kubeconfig=$KUBECONFIG --testconfig=$(workspaces.source.path)/perf-tests/clusterloader2/testing/load/config.yaml --testoverrides=$(workspaces.source.path)/overrides.yaml --nodes=$(params.nodes) --provider=eks --report-dir=$(workspaces.results.path) --alsologtostderr --v=2 $CL2_PROMETHEUS_FLAGS
+        exit_code=$?
+        if [ $exit_code -eq 0 ]; then
+          echo "1" | tee $(results.datapoint.path)
+        else
+          echo "0" | tee $(results.datapoint.path)
+        fi
+        exit $exit_code
+      timeout: 30000s
+    # Copies everything in the results workspace to the S3 path that
+    # prepare-loadtest stored in the s3_result result.
+    - name: upload-results
+      image: amazon/aws-cli
+      workingDir: $(workspaces.results.path)
+      script: |
+        S3_RESULT_PATH=$(cat $(results.s3_result.path))
+        echo "S3 Path: $S3_RESULT_PATH"
+        aws sts get-caller-identity
+        # we expect to see all files from loadtest that clusterloader2 outputs here in this dir
+        ls -larth
+        aws s3 cp . s3://$S3_RESULT_PATH/ --recursive
\ No newline at end of file