feat: Adding CPU / RAM configurations to helm network deployments (#8786)

# Change 1: CPU/RAM Limits for node deployments

This PR assigns resource configurations to nodes that are part of helm network deployments. Adding such resource configurations helps Kubernetes balance and deploy aztec nodes. These initial values are chosen based on historical usage of the currently deployed `devnet` environment in AWS ([Grafana Dashboard](https://grafana.aztec.network/d/cdtxao66xa1ogc/aztec-dashboard?orgId=1&refresh=1m&var-network=devnet&var-instance=All&var-protocol_circuit=All&var-min_block_build=20m&var-system_res_interval=$__auto_interval_system_res_interval&var-sequencer=All&var-prover=All&from=now-7d&to=now)).

**Definitions**

`requests:` This is the minimum amount of a resource that must be available on the underlying server before Kubernetes can deploy the component.

`limits:` After deployment, the component is allowed to flex up and down, but never above this set limit. Using a limit keeps the shared infra stable when there are memory leaks or unexpected application behavior. Components are terminated and redeployed if they exceed the assigned limit.

# Change 2: Options for bots and public networks

Additionally, this PR adds configuration to turn bots as well as public access on or off at the time of the helm deployment. This can be used with the following helm syntax:

```
helm upgrade --install <installation name> . -n <kubernetes namespace> \
  --set network.public=true --set network.enableBots=true
```

By default, `network.public` is `false`, since enabling this deploys load balancers, which are not available when running a Kubernetes cluster on a local machine or within CI environments.

---

These resource configurations have been tested by deploying the parent helm chart to the spartan Kubernetes cluster in AWS.
AztecProtocol · Sep 26, 2024 · 7790ede · 7790ede
1 parent 5e4cfa7
commit 7790ede
Show file tree

Hide file tree

Showing 7 changed files with 60 additions and 19 deletions.
diff --git a/spartan/aztec-network/templates/boot-node.yaml b/spartan/aztec-network/templates/boot-node.yaml
@@ -136,7 +136,7 @@ metadata:
   labels:
     {{- include "aztec-network.labels" . | nindent 4 }}
 spec:
-  clusterIP: None
+  type: {{if .Values.network.public }}"LoadBalancer"{{ else }}"ClusterIP"{{ end }}
   selector:
     {{- include "aztec-network.selectorLabels" . | nindent 4 }}
     app: boot-node

diff --git a/spartan/aztec-network/templates/metrics.yaml b/spartan/aztec-network/templates/metrics.yaml
@@ -78,7 +78,7 @@ metadata:
   labels:
     {{- include "aztec-network.labels" . | nindent 4 }}
 spec:
-  type: {{ .Values.metrics.service.type }}
+  type: {{if .Values.network.public }}"LoadBalancer"{{ else }}"ClusterIP"{{ end }}
   ports:
     - port: {{ .Values.metrics.ports.otlp }}
       targetPort: otlp

diff --git a/spartan/aztec-network/templates/prover-node.yaml b/spartan/aztec-network/templates/prover-node.yaml
@@ -106,7 +106,7 @@ metadata:
   labels:
     {{- include "aztec-network.labels" . | nindent 4 }}
 spec:
-  clusterIP: None
+  type: {{if .Values.network.public }}"LoadBalancer"{{ else }}"ClusterIP"{{ end }}
   selector:
     {{- include "aztec-network.selectorLabels" . | nindent 4 }}
     app: prover-node

diff --git a/spartan/aztec-network/templates/pxe.yaml b/spartan/aztec-network/templates/pxe.yaml
@@ -1,3 +1,4 @@
+{{- if .Values.network.enableBots }}
 apiVersion: apps/v1
 kind: Deployment
 metadata:
@@ -65,7 +66,7 @@ metadata:
   labels:
     {{- include "aztec-network.labels" . | nindent 4 }}
 spec:
-  type: {{ .Values.pxe.service.type }}
+  type: {{if .Values.network.public }}"LoadBalancer"{{ else }}"ClusterIP"{{ end }}
   selector:
     {{- include "aztec-network.selectorLabels" . | nindent 4 }}
     app: pxe
@@ -75,4 +76,5 @@ spec:
       targetPort: {{ .Values.pxe.service.targetPort }}
       {{- if and (eq .Values.pxe.service.type "NodePort") .Values.pxe.service.nodePort }}
       nodePort: {{ .Values.pxe.service.nodePort }}
-      {{- end }}
+      {{- end }}
+{{- end }}
diff --git a/spartan/aztec-network/templates/transaction-bot.yaml b/spartan/aztec-network/templates/transaction-bot.yaml
@@ -1,3 +1,4 @@
+{{- if .Values.network.enableBots }}
 apiVersion: apps/v1
 kind: Deployment
 metadata:
@@ -99,4 +100,5 @@ spec:
       targetPort: {{ .Values.bot.service.targetPort }}
       {{- if and (eq .Values.bot.service.type "NodePort") .Values.bot.service.nodePort }}
       nodePort: {{ .Values.bot.service.nodePort }}
-      {{- end }}
+      {{- end }}
+{{- end }}
diff --git a/spartan/aztec-network/templates/validator.yaml b/spartan/aztec-network/templates/validator.yaml
@@ -115,7 +115,7 @@ metadata:
   labels:
     {{- include "aztec-network.labels" . | nindent 4 }}
 spec:
-  clusterIP: None
+  type: {{if .Values.network.public }}"LoadBalancer"{{ else }}"ClusterIP"{{ end }}
   selector:
     {{- include "aztec-network.selectorLabels" . | nindent 4 }}
     app: validator

diff --git a/spartan/aztec-network/values.yaml b/spartan/aztec-network/values.yaml
@@ -1,3 +1,10 @@
+test: "sample"
+scenario: "default"
+
+network:
+  public: false
+  enableBots: true
+
 images:
   test:
     image: aztecprotocol/end-to-end
@@ -38,7 +45,13 @@ bootNode:
     disabled: true
   p2p:
     enabled: "true"
-  resources: {}
+  resources:
+    requests:
+      memory: "2Gi"
+      cpu: "200m"
+    limits:
+      memory: "4Gi"
+      cpu: "4"
 
 validator:
   replicas: 0
@@ -54,7 +67,13 @@ validator:
     disabled: false
   p2p:
     enabled: "true"
-  resources: {}
+  resources:
+    requests:
+      memory: "2Gi"
+      cpu: "200m"
+    limits:
+      memory: "28Gi"
+      cpu: "7"
 
 proverNode:
   replicas: 1
@@ -64,14 +83,19 @@ proverNode:
   debug: "aztec:*"
   realProofs: false
   proverAgentEnabled: true
-  resources: {}
+  resources:
+    requests:
+      memory: "2Gi"
+      cpu: "200m"
+    limits:
+      memory: "120Gi"
+      cpu: "15"
 
 pxe:
   logLevel: "debug"
   debug: "aztec:*"
   replicas: 1
   service:
-    type: ClusterIP
     port: 8080
     targetPort: 8080
   readinessProbe:
@@ -80,7 +104,13 @@ pxe:
     timeoutSeconds: 5
     successThreshold: 1
     failureThreshold: 3
-  resources: {}
+  resources:
+    requests:
+      memory: "2Gi"
+      cpu: "200m"
+    limits:
+      memory: "4Gi"
+      cpu: "1"
 
 bot:
   logLevel: "debug"
@@ -104,7 +134,13 @@ bot:
     timeoutSeconds: 5
     successThreshold: 1
     failureThreshold: 3
-  resources: {}
+  resources:
+    requests:
+      memory: "2Gi"
+      cpu: "200m"
+    limits:
+      memory: "4Gi"
+      cpu: "1"
 
 ethereum:
   replicas: 1
@@ -126,11 +162,15 @@ ethereum:
     timeoutSeconds: 5
     successThreshold: 1
     failureThreshold: 3
-  resources: {}
+  resources:
+    requests:
+      memory: "2Gi"
+      cpu: "200m"
+    limits:
+      memory: "4Gi"
+      cpu: "1"
 
 metrics:
-  service:
-    type: ClusterIP
   ports:
     otlp: 4316
     metrics: 4315
@@ -143,6 +183,3 @@ metrics:
     timeoutSeconds: 5
     successThreshold: 1
     failureThreshold: 3
-
-test: "sample"
-scenario: "default"