From 86888d4fbb819ddf5bff5ff59844fdac50b95368 Mon Sep 17 00:00:00 2001
From: Edward Welch
Date: Tue, 5 Mar 2024 01:30:13 +0000
Subject: [PATCH] second pass at zone awareness adding rollout operator persisting tokens setting replica counts and max unavailable

Signed-off-by: Edward Welch
---
Reviewer notes (free text here is not part of the commit):
 * statefulset-ingester-zone-a.yaml now hard-codes
   `topologyKey: kubernetes.io/hostname` and values.yaml drops
   `ingester.zoneAwareReplication.topologyKey`, but the zone-b and zone-c
   hunks below do not touch topologyKey. TODO confirm those two templates
   do not still read the removed value.
 * Per-zone replicas are ceil(ingester.replicas / 3), so a total that is not
   a multiple of 3 is rounded up (e.g. 4 requested -> 2 per zone -> 6 total).
 * _helpers-ingester.tpl now ends with a trailing newline.

 production/helm/loki/Chart.lock               |  7 ++-
 production/helm/loki/Chart.yaml               |  5 +++
 .../templates/ingester/_helpers-ingester.tpl  | 18 ++++++++
 .../ingester/statefulset-ingester-zone-a.yaml | 12 +++--
 .../ingester/statefulset-ingester-zone-b.yaml | 10 +++--
 .../ingester/statefulset-ingester-zone-c.yaml | 10 +++--
 production/helm/loki/values.yaml              | 44 +++++++++++++++++--
 7 files changed, 91 insertions(+), 15 deletions(-)

diff --git a/production/helm/loki/Chart.lock b/production/helm/loki/Chart.lock
index 17f1dafad7ae..2cc237d73ef1 100644
--- a/production/helm/loki/Chart.lock
+++ b/production/helm/loki/Chart.lock
@@ -5,5 +5,8 @@ dependencies:
 - name: grafana-agent-operator
   repository: https://grafana.github.io/helm-charts
   version: 0.2.16
-digest: sha256:56eeb13a669bc816c1452cde5d6dddc61f6893f8aff3da1d2b56ce3bdcbcf84d
-generated: "2023-11-09T12:22:25.317696-03:00"
+- name: rollout-operator
+  repository: https://grafana.github.io/helm-charts
+  version: 0.13.0
+digest: sha256:ce0df9e286933f30653da8be12efea8e1549acdf10a527e459a2fa5ac3ef1636
+generated: "2024-03-04T14:50:50.223409936-05:00"
diff --git a/production/helm/loki/Chart.yaml b/production/helm/loki/Chart.yaml
index 8b5402bc9dbd..f08e27a2115d 100644
--- a/production/helm/loki/Chart.yaml
+++ b/production/helm/loki/Chart.yaml
@@ -21,6 +21,11 @@ dependencies:
     version: 0.2.16
     repository: https://grafana.github.io/helm-charts
     condition: monitoring.selfMonitoring.grafanaAgent.installOperator
+  - name: rollout-operator
+    alias: rollout_operator
+    repository: https://grafana.github.io/helm-charts
+    version: 0.13.0
+    condition: rollout_operator.enabled
 maintainers:
   - name: trevorwhitney
   - name: jeschkies
diff --git a/production/helm/loki/templates/ingester/_helpers-ingester.tpl b/production/helm/loki/templates/ingester/_helpers-ingester.tpl
index b3e3d2ae224a..418d4094d5ff 100644
--- a/production/helm/loki/templates/ingester/_helpers-ingester.tpl
+++ b/production/helm/loki/templates/ingester/_helpers-ingester.tpl
@@ -54,3 +54,21 @@ livenessProbe:
 {{- end }}
 {{- end }}
 {{- end -}}
+
+{{/*
+Ingester replicas per zone: ceil(.Values.ingester.replicas / 3). Expects global context
+*/}}
+{{- define "loki.ingester.replicaCount" -}}
+{{- ceil (divf .Values.ingester.replicas 3) -}}
+{{- end -}}
+
+{{/*
+Ingesters per zone that may be unavailable at once: ceil(replicas * maxUnavailablePct / 100). Expects a dict
+{
+  "replicas": replicas in a zone,
+  "ctx": global context
+}
+*/}}
+{{- define "loki.ingester.maxUnavailable" -}}
+{{- ceil (mulf .replicas (divf (int .ctx.Values.ingester.zoneAwareReplication.maxUnavailablePct) 100)) -}}
+{{- end -}}
diff --git a/production/helm/loki/templates/ingester/statefulset-ingester-zone-a.yaml b/production/helm/loki/templates/ingester/statefulset-ingester-zone-a.yaml
index 7e686a9ce901..31ee8672369f 100644
--- a/production/helm/loki/templates/ingester/statefulset-ingester-zone-a.yaml
+++ b/production/helm/loki/templates/ingester/statefulset-ingester-zone-a.yaml
@@ -1,5 +1,6 @@
 {{- $isDistributed := eq (include "loki.deployment.isDistributed" .) "true" -}}
 {{- if and $isDistributed .Values.ingester.zoneAwareReplication.enabled }}
+{{- $replicas := (include "loki.ingester.replicaCount" .) -}}
 apiVersion: apps/v1
 kind: StatefulSet
 metadata:
@@ -10,13 +11,14 @@ metadata:
     app.kubernetes.io/part-of: memberlist
     rollout-group: ingester
     name: ingester-zone-a
-  {{- with .Values.loki.annotations }}
   annotations:
+    rollout-max-unavailable: "{{ include "loki.ingester.maxUnavailable" (dict "ctx" . "replicas" $replicas)}}"
+    {{- with .Values.loki.annotations }}
     {{- toYaml . | nindent 4 }}
-  {{- end }}
+    {{- end }}
 spec:
 {{- if not .Values.ingester.autoscaling.enabled }}
-  replicas: {{ .Values.ingester.replicas }}
+  replicas: {{ $replicas }}
 {{- end }}
   podManagementPolicy: Parallel
   serviceName: {{ include "loki.ingesterFullname" . }}-zone-a
@@ -93,6 +95,8 @@ spec:
           args:
             - -config.file=/etc/loki/config/config.yaml
             - -ingester.availability-zone=zone-a
+            - -ingester.unregister-on-shutdown=false
+            - -ingester.tokens-file-path=/var/loki/ring-tokens
             - -target=ingester
             {{- with .Values.ingester.extraArgs }}
             {{- toYaml . | nindent 12 }}
@@ -153,7 +157,7 @@ spec:
                     operator: NotIn
                     values:
                       - ingester-zone-a
-              topologyKey: {{ .Values.ingester.zoneAwareReplication.topologyKey }}
+              topologyKey: kubernetes.io/hostname
         {{- with .Values.ingester.zoneAwareReplication.zoneA.extraAffinity }}
           {{- toYaml . | nindent 8 }}
         {{- end }}
diff --git a/production/helm/loki/templates/ingester/statefulset-ingester-zone-b.yaml b/production/helm/loki/templates/ingester/statefulset-ingester-zone-b.yaml
index f8d560e31954..8db430175428 100644
--- a/production/helm/loki/templates/ingester/statefulset-ingester-zone-b.yaml
+++ b/production/helm/loki/templates/ingester/statefulset-ingester-zone-b.yaml
@@ -1,5 +1,6 @@
 {{- $isDistributed := eq (include "loki.deployment.isDistributed" .) "true" -}}
 {{- if and $isDistributed .Values.ingester.zoneAwareReplication.enabled }}
+{{- $replicas := (include "loki.ingester.replicaCount" .) -}}
 apiVersion: apps/v1
 kind: StatefulSet
 metadata:
@@ -10,13 +11,14 @@ metadata:
     app.kubernetes.io/part-of: memberlist
     rollout-group: ingester
     name: ingester-zone-b
-  {{- with .Values.loki.annotations }}
   annotations:
+    rollout-max-unavailable: "{{ include "loki.ingester.maxUnavailable" (dict "ctx" . "replicas" $replicas)}}"
+    {{- with .Values.loki.annotations }}
     {{- toYaml . | nindent 4 }}
-  {{- end }}
+    {{- end }}
 spec:
 {{- if not .Values.ingester.autoscaling.enabled }}
-  replicas: {{ .Values.ingester.replicas }}
+  replicas: {{ $replicas }}
 {{- end }}
   podManagementPolicy: Parallel
   serviceName: {{ include "loki.ingesterFullname" . }}-zone-b
@@ -93,6 +95,8 @@ spec:
           args:
             - -config.file=/etc/loki/config/config.yaml
             - -ingester.availability-zone=zone-b
+            - -ingester.unregister-on-shutdown=false
+            - -ingester.tokens-file-path=/var/loki/ring-tokens
             - -target=ingester
             {{- with .Values.ingester.extraArgs }}
             {{- toYaml . | nindent 12 }}
diff --git a/production/helm/loki/templates/ingester/statefulset-ingester-zone-c.yaml b/production/helm/loki/templates/ingester/statefulset-ingester-zone-c.yaml
index ca5fedfca9ee..1fd002db753e 100644
--- a/production/helm/loki/templates/ingester/statefulset-ingester-zone-c.yaml
+++ b/production/helm/loki/templates/ingester/statefulset-ingester-zone-c.yaml
@@ -1,5 +1,6 @@
 {{- $isDistributed := eq (include "loki.deployment.isDistributed" .) "true" -}}
 {{- if and $isDistributed .Values.ingester.zoneAwareReplication.enabled }}
+{{- $replicas := (include "loki.ingester.replicaCount" .) -}}
 apiVersion: apps/v1
 kind: StatefulSet
 metadata:
@@ -10,13 +11,14 @@ metadata:
     app.kubernetes.io/part-of: memberlist
     rollout-group: ingester
     name: ingester-zone-c
-  {{- with .Values.loki.annotations }}
   annotations:
+    rollout-max-unavailable: "{{ include "loki.ingester.maxUnavailable" (dict "ctx" . "replicas" $replicas)}}"
+    {{- with .Values.loki.annotations }}
     {{- toYaml . | nindent 4 }}
-  {{- end }}
+    {{- end }}
 spec:
 {{- if not .Values.ingester.autoscaling.enabled }}
-  replicas: {{ .Values.ingester.replicas }}
+  replicas: {{ $replicas }}
 {{- end }}
   podManagementPolicy: Parallel
   serviceName: {{ include "loki.ingesterFullname" . }}-zone-c
@@ -93,6 +95,8 @@ spec:
           args:
             - -config.file=/etc/loki/config/config.yaml
             - -ingester.availability-zone=zone-c
+            - -ingester.unregister-on-shutdown=false
+            - -ingester.tokens-file-path=/var/loki/ring-tokens
             - -target=ingester
             {{- with .Values.ingester.extraArgs }}
             {{- toYaml . | nindent 12 }}
diff --git a/production/helm/loki/values.yaml b/production/helm/loki/values.yaml
index 843f538748fe..f6bd5df13d48 100644
--- a/production/helm/loki/values.yaml
+++ b/production/helm/loki/values.yaml
@@ -1442,7 +1442,8 @@ backend:
 
 # -- Configuration for the ingester
 ingester:
-  # -- Number of replicas for the ingester
+  # -- Number of replicas for the ingester. When zoneAwareReplication.enabled is true, each of the 3 zones
+  # runs ceil(replicas / 3) ingesters, so a total that is not a multiple of 3 is rounded up.
   replicas: 0
   # -- hostAliases to add
   hostAliases: []
@@ -1570,24 +1571,34 @@ ingester:
     # -- Set the optional grpc service protocol. Ex: "grpc", "http2" or "https"
     grpc: ""
   # -- Enabling zone awareness on ingesters will create 3 statefulests where all writes will send a replica to each zone.
-  # This is primarily intended to accellerate rollout operations by allowing for multiple ingesters within a single
+  # This is primarily intended to accelerate rollout operations by allowing for multiple ingesters within a single
   # zone to be shutdown and restart simultaneously (the remaining 2 zones will be guaranteed to have at least one copy
   # of the data).
   # Note: This can be used to run Loki over multiple cloud provider availability zones however this is not currently
   # recommended as Loki is not optimized for this and cross zone network traffic costs can become extremely high
   # extremely quickly. Even with zone awareness enabled, it is recommended to run Loki in a single availability zone.
   zoneAwareReplication:
+    # -- Enable zone awareness.
     enabled: true
-    topologyKey: 'kubernetes.io/hostname'
+    # -- The percentage (0-100) of replicas in each zone that may be restarted at once; the resulting count is rounded up.
+    maxUnavailablePct: 33
+    # -- zoneA configuration
     zoneA:
+      # -- optionally define a node selector for this zone
       nodeSelector: null
+      # -- optionally define extra affinity rules, by default different zones are not allowed to schedule on the same host
       extraAffinity: {}
     zoneB:
+      # -- optionally define a node selector for this zone
       nodeSelector: null
+      # -- optionally define extra affinity rules, by default different zones are not allowed to schedule on the same host
       extraAffinity: {}
     zoneC:
+      # -- optionally define a node selector for this zone
       nodeSelector: null
+      # -- optionally define extra affinity rules, by default different zones are not allowed to schedule on the same host
       extraAffinity: {}
+    # -- The migration block allows migrating non zone aware ingesters to zone aware ingesters.
     migration:
       enabled: false
       excludeDefaultZone: false
@@ -2292,6 +2303,33 @@ ruler:
 # Subchart configurations
 #
 ######################################################################################################################
+# -- Setting for the Grafana Rollout Operator https://github.com/grafana/helm-charts/tree/main/charts/rollout-operator
+rollout_operator:
+  enabled: true
+
+  # -- podSecurityContext is the pod security context for the rollout operator.
+  # When installing on OpenShift, override podSecurityContext settings with
+  #
+  # rollout_operator:
+  #   podSecurityContext:
+  #     fsGroup: null
+  #     runAsGroup: null
+  #     runAsUser: null
+  podSecurityContext:
+    fsGroup: 10001
+    runAsGroup: 10001
+    runAsNonRoot: true
+    runAsUser: 10001
+    seccompProfile:
+      type: RuntimeDefault
+
+  # Set the container security context
+  securityContext:
+    readOnlyRootFilesystem: true
+    capabilities:
+      drop: [ALL]
+    allowPrivilegeEscalation: false
+
 # -- Configuration for the minio subchart
 minio:
   enabled: false