Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

qa-prometheus setup to match production #3006

Merged
merged 1 commit into from
Sep 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 81 additions & 34 deletions qa-prometheus.planx-pla.net/manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,52 +3,99 @@
"This is the dev environment manifest",
"That's all I have to say"
],
"jenkins": {
"autodeploy": "yes"
},
"versions": {
"arborist": "quay.io/cdis/arborist:2023.12",
"aws-es-proxy": "quay.io/cdis/aws-es-proxy:0.8",
"fence": "quay.io/cdis/fence:2023.12",
"fluentd": "fluent/fluentd-kubernetes-daemonset:v1.10.2-debian-cloudwatch-1.0",
"indexd": "quay.io/cdis/indexd:2023.12",
"jupyterhub": "quay.io/occ_data/jupyterhub:master",
"peregrine": "quay.io/cdis/peregrine:2023.12",
"pidgin": "quay.io/cdis/pidgin:2023.12",
"portal": "quay.io/cdis/data-portal:2023.12",
"revproxy": "quay.io/cdis/nginx:1.17.6-ctds-1.0.1",
"sheepdog": "quay.io/cdis/sheepdog:2023.12",
"spark": "quay.io/cdis/gen3-spark:2023.12",
"manifestservice": "quay.io/cdis/manifestservice:2023.12",
"wts": "quay.io/cdis/workspace-token-service:2023.12",
"tube": "quay.io/cdis/tube:master"
},
"arranger": {
"project_id": "dev",
"auth_filter_field": "gen3_resource_path",
"auth_filter_node_types": [
"subject"
]
"ambassador": "quay.io/datawire/ambassador:1.4.2",
"arborist": "quay.io/cdis/arborist:2024.05",
"aws-es-proxy": "quay.io/cdis/aws-es-proxy:v1.3.1",
"dashboard": "quay.io/cdis/gen3-statics:2024.05",
"fence": "quay.io/cdis/fence:10.2.0",
"fluentd": "fluent/fluentd-kubernetes-daemonset:v1.15.3-debian-cloudwatch-1.0",
"hatchery": "quay.io/cdis/hatchery:2.1.3",
"indexd": "quay.io/cdis/indexd:2024.05",
"kayako-wrapper": "quay.io/cdis/kayako-wrapper-service:0.2.0",
"manifestservice": "quay.io/cdis/manifestservice:2024.05",
"metadata": "quay.io/cdis/metadata-service:feat_pdc-filter",
"peregrine": "quay.io/cdis/peregrine:2024.05",
"portal": "quay.io/cdis/data-portal:2024.05",
"requestor": "quay.io/cdis/requestor:2024.05",
"revproxy": "quay.io/cdis/nginx:2024.05",
"sheepdog": "quay.io/cdis/sheepdog:2024.05",
"wts": "quay.io/cdis/workspace-token-service:2024.05",
"frontend-framework": "quay.io/cdis/prometheus-data-platform:main"
},
"arborist": {
"deployment_version": "2"
},
"jupyterhub": {
"enabled": "no"
"indexd": {
"arborist": "true"
},
"global": {
"environment": "qaplanetv2",
"hostname": "qaplanetv2.planx-pla.net",
"revproxy_arn": "arn:aws:acm:us-east-1:707767160287:certificate/c676c81c-9546-4e9a-9a72-725dd3912bc8",
"dictionary_url": "https://s3.amazonaws.com/dictionary-artifacts/datadictionary/develop/schema.json",
"portal_app": "dev",
"kube_bucket": "kube-qaplanetv2-gen3",
"logs_bucket": "logs-qaplanetv2-gen3",
"environment": "qa-prometheus",
"hostname": "qa-prometheus.planx-pla.net",
"revproxy_arn": "arn:aws:acm:us-east-1:851725307933:certificate/94dc6b22-6ad2-481f-aff5-59ad7381e9c6",
"dictionary_url": "https://s3.amazonaws.com/dictionary-artifacts/gtexdictionary/4.0.6/schema.json",
"dispatcher_job_num": "10",
"portal_app": "gitops",
"sync_from_dbgap": "False",
"useryaml_s3path": "s3://cdis-gen3-users/qa/user.yaml",
"kube_bucket": "kube_bucket.devplanetv1.gen3",
"logs_bucket": "logs-devplanetv1-gen3",
"useryaml_s3path": "s3://cdis-gen3-users/pdp/user.yaml",
"tier_access_level": "regular",
"tier_access_limit": 50,
"public_datasets": true,
"netpolicy": "on",
"lb_type": "internal",
"argocd": "true",
"es7": true
"waf_enabled": "true",
"pdb": "on",
"karpenter": "true",
"ecr-access-job-role-arn": "arn:aws:iam::654654631253:role/EcrRepoPolicyUpdateRole",
"frontend_root": "gen3ff"
},
"metadata": {
"USE_AGG_MDS": true,
"AGG_MDS_NAMESPACE": "pdp-commons"
},
"portal": {
"GEN3_BUNDLE": "ecosystem"
},
"canary": {
"default": 0
},
"scaling": {
"arborist": {
"strategy": "auto",
"min": 1,
"max": 1
},
"fence": {
"strategy": "auto",
"min": 1,
"max": 1
},
"indexd": {
"strategy": "auto",
"min": 1,
"max": 1
},
"revproxy": {
"strategy": "auto",
"min": 1,
"max": 1
},
"presigned-url-fence": {
"strategy": "auto",
"min": 1,
"max": 1,
"targetCpu": 40
},
"metadata": {
"strategy": "auto",
"min": 1,
"max": 1,
"targetCpu": 40
}
}
}
123 changes: 123 additions & 0 deletions qa-prometheus.planx-pla.net/manifests/scaling/awsnodetemplate.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
# Karpenter AWSNodeTemplate "default": AMI, network selectors, tags,
# instance-metadata options, boot-time user data and root volume settings.
# VPC_NAME is a placeholder token -- presumably substituted at deploy time;
# confirm against the deployment tooling.
apiVersion: karpenter.k8s.aws/v1alpha1
kind: AWSNodeTemplate
metadata:
  name: default
spec:
  # Pin the node image to one specific AMI ID.
  amiSelector:
    aws::ids: ami-0d3eabf74e1e2258b
  # Subnets and security groups are selected by their karpenter.sh/discovery tag.
  subnetSelector:
    karpenter.sh/discovery: VPC_NAME
  securityGroupSelector:
    karpenter.sh/discovery: VPC_NAME
  tags:
    karpenter.sh/discovery: VPC_NAME
    Environment: VPC_NAME
    Name: eks-VPC_NAME-karpenter
    purpose: default
  # httpTokens: optional leaves token-less metadata requests enabled; the
  # user-data script below queries 169.254.169.254 with a plain curl.
  metadataOptions:
    httpEndpoint: enabled
    httpProtocolIPv6: disabled
    httpPutResponseHopLimit: 2
    httpTokens: optional
  # userData is a MIME multipart document with two parts:
  #   1. text/x-shellscript: appends the ops_team public keys to ec2-user's
  #      authorized_keys, sets registryPullQPS=0 in the kubelet config, sets
  #      fs.inotify.max_user_watches, runs yum update, installs dracut-fips
  #      and openssl, regenerates the initramfs and adds fips=1 to every
  #      kernel entry. (instanceId is fetched but not used later in the script.)
  #   2. text/cloud-config: a power_state reboot (its message text says
  #      "Powering off" although mode is reboot).
  # Each part's headers must begin on the line directly after its --BOUNDARY
  # delimiter: a blank line there terminates an empty header block and turns
  # the Content-Type line into body text. Everything below the "|" is part of
  # the string value, so no YAML comments may be placed inside it.
  userData: |
    MIME-Version: 1.0
    Content-Type: multipart/mixed; boundary="BOUNDARY"

    --BOUNDARY
    Content-Type: text/x-shellscript; charset="us-ascii"

    #!/bin/bash -x
    instanceId=$(curl -s http://169.254.169.254/latest/dynamic/instance-identity/document | jq -r .instanceId)
    curl https://raw.githubusercontent.com/uc-cdis/cloud-automation/master/files/authorized_keys/ops_team >> /home/ec2-user/.ssh/authorized_keys

    echo "$(jq '.registryPullQPS=0' /etc/kubernetes/kubelet/kubelet-config.json)" > /etc/kubernetes/kubelet/kubelet-config.json

    sysctl -w fs.inotify.max_user_watches=12000

    sudo yum update -y
    sudo yum install -y dracut-fips openssl >> /opt/fips-install.log
    sudo dracut -f
    # configure grub
    sudo /sbin/grubby --update-kernel=ALL --args="fips=1"

    --BOUNDARY
    Content-Type: text/cloud-config; charset="us-ascii"

    power_state:
      delay: now
      mode: reboot
      message: Powering off
      timeout: 2
      condition: true


    --BOUNDARY--
  # Root volume: 50Gi encrypted gp2, deleted together with the instance.
  blockDeviceMappings:
    - deviceName: /dev/xvda
      ebs:
        volumeSize: 50Gi
        volumeType: gp2
        encrypted: true
        deleteOnTermination: true
---
# Karpenter AWSNodeTemplate "jupyter": same shape as a general node template
# but with its own security-group discovery tag and Name/purpose tags.
# VPC_NAME is a placeholder token -- presumably substituted at deploy time;
# confirm against the deployment tooling.
apiVersion: karpenter.k8s.aws/v1alpha1
kind: AWSNodeTemplate
metadata:
  name: jupyter
spec:
  # Node image, pinned by AMI ID.
  amiSelector:
    aws::ids: ami-0d3eabf74e1e2258b
  # Security groups come from the "-jupyter" discovery tag; subnets use the
  # plain VPC_NAME tag.
  securityGroupSelector:
    karpenter.sh/discovery: VPC_NAME-jupyter
  subnetSelector:
    karpenter.sh/discovery: VPC_NAME
  tags:
    karpenter.sh/discovery: VPC_NAME
    Environment: VPC_NAME
    Name: eks-VPC_NAME-jupyter-karpenter
    purpose: jupyter
  # Root disk: encrypted 50Gi gp2 volume that is removed with the instance.
  blockDeviceMappings:
    - deviceName: /dev/xvda
      ebs:
        deleteOnTermination: true
        encrypted: true
        volumeType: gp2
        volumeSize: 50Gi
  # Token-less metadata requests stay allowed (httpTokens: optional); the
  # boot script below calls the metadata endpoint with a plain curl.
  metadataOptions:
    httpTokens: optional
    httpPutResponseHopLimit: 2
    httpProtocolIPv6: disabled
    httpEndpoint: enabled
  # Two-part MIME document. Part one is a bash script (ops_team SSH keys,
  # kubelet registryPullQPS=0, inotify watch limit, yum update, FIPS packages,
  # initramfs rebuild, fips=1 kernel argument). Part two is a cloud-config
  # power_state block that reboots the node. The text under "|" is the literal
  # string value -- do not add YAML comments inside it.
  userData: |
    MIME-Version: 1.0
    Content-Type: multipart/mixed; boundary="BOUNDARY"

    --BOUNDARY
    Content-Type: text/x-shellscript; charset="us-ascii"

    #!/bin/bash -x
    instanceId=$(curl -s http://169.254.169.254/latest/dynamic/instance-identity/document | jq -r .instanceId)
    curl https://raw.githubusercontent.com/uc-cdis/cloud-automation/master/files/authorized_keys/ops_team >> /home/ec2-user/.ssh/authorized_keys

    echo "$(jq '.registryPullQPS=0' /etc/kubernetes/kubelet/kubelet-config.json)" > /etc/kubernetes/kubelet/kubelet-config.json

    sysctl -w fs.inotify.max_user_watches=12000

    sudo yum update -y
    sudo yum install -y dracut-fips openssl >> /opt/fips-install.log
    sudo dracut -f
    # configure grub
    sudo /sbin/grubby --update-kernel=ALL --args="fips=1"

    --BOUNDARY
    Content-Type: text/cloud-config; charset="us-ascii"

    power_state:
      delay: now
      mode: reboot
      message: Powering off
      timeout: 2
      condition: true

    --BOUNDARY--
74 changes: 74 additions & 0 deletions qa-prometheus.planx-pla.net/manifests/scaling/provisioner.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Karpenter Provisioner "default": amd64 nodes from the c/m/r/t instance
# categories, on-demand or spot, launched from the "default" node template.
apiVersion: karpenter.sh/v1alpha5
kind: Provisioner
metadata:
  name: default
spec:
  # Node template this provisioner launches from
  providerRef:
    name: default
  # Upper bound on provisioned capacity: 1000 vCPUs
  limits:
    resources:
      cpu: 1000
  # Nodes expire after 2592000 seconds (30 days) so they stay up to date
  ttlSecondsUntilExpired: 2592000
  # Permit consolidation, i.e. pods may be rearranged onto other nodes
  consolidation:
    enabled: true
  # Scheduling constraints; both spot and on-demand capacity are allowed
  requirements:
    - key: karpenter.sh/capacity-type
      operator: In
      values:
        - on-demand
        - spot
    - key: kubernetes.io/arch
      operator: In
      values:
        - amd64
    - key: karpenter.k8s.aws/instance-category
      operator: In
      values:
        - c
        - m
        - r
        - t
---
# Karpenter Provisioner "jupyter": on-demand-only amd64 nodes from the
# c/m/r/t instance categories, tainted and labelled role=jupyter and launched
# from the "jupyter" node template.
apiVersion: karpenter.sh/v1alpha5
kind: Provisioner
metadata:
  name: jupyter
spec:
  # Node template this provisioner launches from
  providerRef:
    name: jupyter
  # Nodes carry the label role=jupyter ...
  labels:
    role: jupyter
  # ... and a matching NoSchedule taint
  taints:
    - key: role
      value: jupyter
      effect: NoSchedule
  # Upper bound on provisioned capacity: 1000 vCPUs
  limits:
    resources:
      cpu: 1000
  # Nodes expire after 2592000 seconds (30 days) so they stay up to date
  ttlSecondsUntilExpired: 2592000
  # Permit consolidation, i.e. pods may be rearranged onto other nodes
  consolidation:
    enabled: true
  # Scheduling constraints; on-demand capacity only
  requirements:
    - key: karpenter.sh/capacity-type
      operator: In
      values:
        - on-demand
    - key: kubernetes.io/arch
      operator: In
      values:
        - amd64
    - key: karpenter.k8s.aws/instance-category
      operator: In
      values:
        - c
        - m
        - r
        - t
57 changes: 0 additions & 57 deletions qa-prometheus.planx-pla.net/manifests/scaling/scaling.json

This file was deleted.

Loading
Loading