Merge branch 'develop' into 3243-promtail-pipeline
raftmsohani authored Dec 10, 2024
2 parents a11b598 + 5d03c3c commit b3a5317
Showing 10 changed files with 153 additions and 45 deletions.
1 change: 1 addition & 0 deletions .gitconfig
@@ -13,6 +13,7 @@
allowed = .*DJANGO_SECRET_KEY=.* #this is auto-generated in deployed environments
allowed = ./tdrs-backend/manifest.proxy.yml:*
allowed = ./tdrs-backend/plg/loki/manifest.yml:*
allowed = ./tdrs-backend/plg/deploy.sh:84
patterns = (A3T[A-Z0-9]|AKIA|AGPA|AIDA|AROA|AIPA|ANPA|ANVA|ASIA)[A-Z0-9]{16}
patterns = (\"|')?(AWS|aws|Aws)?_?(SECRET|secret|Secret)?_?(ACCESS|access|Access)?_?(KEY|key|Key)(\"|')?\\s*(:|=>|=)\\s*(\"|')?[A-Za-z0-9/\\+=]{40}(\"|')?
patterns = (\"|')?(AWS|aws|Aws)?_?(ACCOUNT|account|Account)_?(ID|id|Id)?(\"|')?\\s*(:|=>|=)\\s*(\"|')?[0-9]{4}\\-?[0-9]{4}\\-?[0-9]{4}(\"|')?
28 changes: 28 additions & 0 deletions tdrs-backend/plg/README.md
@@ -1,3 +1,31 @@
# TDP PLG Stack
Before attempting to deploy the PLG stack or a Postgres exporter, you MUST have access to the production space in cloud.gov.

## Deploying PLG
Before deploying the PLG stack you must have the `ADMIN_EMAILS` and `DEV_EMAILS` variables defined in your shell environment. Each variable should be a comma-separated string of emails, e.g.: `ADMIN_EMAILS="email1@email.com, email2@email.com, email3@email.com"` and `DEV_EMAILS="email4@email.com, email5@email.com, email6@email.com"`.
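For a local shell, the variables can be exported before running the deploy script. A minimal sketch with placeholder addresses:

```shell
# Placeholder addresses -- substitute the real admin/dev email lists.
export ADMIN_EMAILS="email1@email.com, email2@email.com"
export DEV_EMAILS="email4@email.com, email5@email.com"

# Quick sanity check mirroring the script's own validation, which
# exits if either variable is empty.
if [ -n "$ADMIN_EMAILS" ] && [ -n "$DEV_EMAILS" ]; then
  echo "email vars set"
fi
```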

Once both of the above items have been confirmed, you can target the production environment with the CF CLI and run the command below.

```
./deploy.sh -a -d tdp-db-prod
```

The command will deploy the entire PLG stack to the production environment and set up all appropriate network policies and routes.

## Deploying a Postgres Exporter
Before deploying a Postgres exporter, you need the AWS RDS database URI for the RDS instance in the environment you are deploying the exporter to. You can find it by inspecting the backend app's environment:

```
cf env <BACKEND_APP>
```

From the output of this command, find the `VCAP_SERVICES` variable. It contains a JSON object describing the services bound to the app you provided. Under the `aws-rds` key, copy the `uri` value from the `credentials` object to your clipboard. Then you can deploy your exporter with the command below.

```
./deploy.sh -p <ENVIRONMENT_NAME> -d <RDS_SERVICE_NAME> -u <DATABASE_URI>
```
where `<ENVIRONMENT_NAME>` MUST be one of `[dev, staging, production]`, and `<DATABASE_URI>` is the URI you just copied from the app's `VCAP_SERVICES` environment variable. This command also handles all of the necessary networking configuration.
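The manual copy step can also be scripted. A hypothetical sketch, assuming `jq` is installed and that `cf env` prints the standard `System-Provided:` JSON block (the app name below is only an example):

```shell
# Hypothetical helper -- extracts the aws-rds URI from `cf env` output.
APP_NAME="tdp-backend-prod"   # example app name; substitute your backend app

cf env "$APP_NAME" \
  | awk '/^\{/,/^\}/' \
  | jq -r '.VCAP_SERVICES["aws-rds"][0].credentials.uri'
```

The printed value can then be passed to `./deploy.sh` as the `-u` argument.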

### Grafana Auth and RBAC Config
Grafana is accessible on a private route, via any frontend app, to users who have the correct role. The Grafana UI is not accessible to any user or application unless they are routed to it via a frontend app. Grafana is also configured to require username and password authentication. This extra layer of authentication is required because the roles defined in Grafana do not align with the roles TDP defines. Assigning users to the appropriate roles and teams in Grafana allows least-privilege access to any information that Grafana might be able to display.

15 changes: 13 additions & 2 deletions tdrs-backend/plg/alertmanager/alertmanager.yml
@@ -39,13 +39,22 @@ route:

# The child route trees.
routes:
# This routes performs a regular expression match on alert labels to
# catch alerts that are related to a list of services.
# Only alert dev team of uptime issues
- matchers:
- alertname=~"UpTime"
receiver: dev-team-emails
group_wait: 30m

# Send all severity CRITICAL/ERROR alerts to OFA admins and TDP Devs
- matchers:
- severity=~"ERROR|CRITICAL"
receiver: admin-team-emails
continue: true
- matchers:
- severity=~"ERROR|CRITICAL"
receiver: dev-team-emails
continue: true

# Inhibition rules allow to mute a set of alerts given that another alert is
# firing.
# We use this to mute any warning-level notifications if the same alert is
@@ -65,7 +74,9 @@ receivers:
- name: 'admin-team-emails'
email_configs:
- to: '{{ admin_team_emails }}'
send_resolved: true

- name: 'dev-team-emails'
email_configs:
- to: '{{ dev_team_emails }}'
send_resolved: true
10 changes: 7 additions & 3 deletions tdrs-backend/plg/alertmanager/manifest.yml
@@ -1,10 +1,14 @@
version: 1
applications:
- name: alertmanager
memory: 512M
disk_quota: 1G
memory: 128M
disk_quota: 5G
instances: 1
command: |
mkdir /tmp
mkdir data
wget https://github.com/prometheus/alertmanager/releases/download/v0.27.0/alertmanager-0.27.0.linux-amd64.tar.gz
tar -zxvf alertmanager-0.27.0.linux-amd64.tar.gz
rm -rf alertmanager-0.27.0.linux-amd64.tar.gz
./alertmanager-0.27.0.linux-amd64/alertmanager --config.file=/home/vcap/app/alertmanager.prod.yml --web.listen-address=:8080 --storage.path=/home/vcap/app/data --log.level=debug --web.external-url=http://alertmanager.apps.internal:8080/alerts --web.route-prefix=/alerts --cluster.listen-address=""
buildpacks:
- https://github.com/cloudfoundry/binary-buildpack
114 changes: 79 additions & 35 deletions tdrs-backend/plg/deploy.sh
@@ -46,7 +46,6 @@ deploy_pg_exporter() {

deploy_grafana() {
pushd grafana
APP_NAME="grafana"
DATASOURCES="datasources.yml"
cp datasources.template.yml $DATASOURCES
MANIFEST=manifest.tmp.yml
@@ -57,20 +56,7 @@ deploy_grafana() {
yq eval -i ".applications[0].services[0] = \"$1\"" $MANIFEST

cf push --no-route -f $MANIFEST -t 180 --strategy rolling
cf map-route $APP_NAME apps.internal --hostname $APP_NAME

# Add policy to allow grafana to talk to prometheus and loki
cf add-network-policy $APP_NAME prometheus --protocol tcp --port 8080
cf add-network-policy $APP_NAME loki --protocol tcp --port 8080

# Add network policies to allow grafana to talk to all frontend apps in all environments
for app in ${DEV_FRONTEND_APPS[@]}; do
cf add-network-policy "grafana" $app -s "tanf-dev" --protocol tcp --port 80
done
for app in ${STAGING_FRONTEND_APPS[@]}; do
cf add-network-policy "grafana" $app -s "tanf-staging" --protocol tcp --port 80
done
cf add-network-policy "grafana" $PROD_FRONTEND --protocol tcp --port 80
cf map-route grafana apps.internal --hostname grafana

rm $DATASOURCES
rm $MANIFEST
@@ -81,16 +67,6 @@ deploy_prometheus() {
pushd prometheus
cf push --no-route -f manifest.yml -t 180 --strategy rolling
cf map-route prometheus apps.internal --hostname prometheus

# Add network policies to allow prometheus to talk to all backend apps in all environments
for app in ${DEV_BACKEND_APPS[@]}; do
cf add-network-policy prometheus $app -s "tanf-dev" --protocol tcp --port 8080
done
for app in ${STAGING_BACKEND_APPS[@]}; do
cf add-network-policy prometheus $app -s "tanf-staging" --protocol tcp --port 8080
done
cf add-network-policy prometheus $PROD_BACKEND --protocol tcp --port 8080

popd
}

@@ -101,23 +77,88 @@ deploy_loki() {
popd
}

setup_extra_net_pols() {
# Add network policies to allow frontend/backend to talk to grafana/loki
deploy_alertmanager() {
pushd alertmanager
CONFIG=alertmanager.prod.yml
cp alertmanager.yml $CONFIG
SENDGRID_API_KEY=$(cf env tdp-backend-prod | grep SENDGRID | cut -d " " -f2-)
yq eval -i ".global.smtp_auth_password = \"$SENDGRID_API_KEY\"" $CONFIG
yq eval -i ".receivers[0].email_configs[0].to = \"${ADMIN_EMAILS}\"" $CONFIG
yq eval -i ".receivers[1].email_configs[0].to = \"${DEV_EMAILS}\"" $CONFIG
cf push --no-route -f manifest.yml -t 180 --strategy rolling
cf map-route alertmanager apps.internal --hostname alertmanager
rm $CONFIG
popd
}

setup_prod_net_pols() {
# Target prod environment just in case
cf target -o hhs-acf-ofa -s tanf-prod

# Let grafana talk to prometheus and loki
cf add-network-policy grafana prometheus --protocol tcp --port 8080
cf add-network-policy grafana loki --protocol tcp --port 8080

# Let prometheus talk to alertmanager/grafana/loki/prod backend
cf add-network-policy prometheus alertmanager --protocol tcp --port 8080
cf add-network-policy prometheus $PROD_BACKEND --protocol tcp --port 8080
cf add-network-policy prometheus grafana --protocol tcp --port 8080
cf add-network-policy prometheus loki --protocol tcp --port 8080

# Let alertmanager/grafana talk to the prod frontend and vice versa
cf add-network-policy alertmanager $PROD_FRONTEND --protocol tcp --port 80
cf add-network-policy grafana $PROD_FRONTEND --protocol tcp --port 80
cf add-network-policy $PROD_FRONTEND alertmanager -s tanf-prod --protocol tcp --port 8080
cf add-network-policy $PROD_FRONTEND grafana -s tanf-prod --protocol tcp --port 8080

# Let prod backend send logs to loki
cf add-network-policy $PROD_BACKEND loki -s tanf-prod --protocol tcp --port 8080

# Add network policies to allow alertmanager/grafana to talk to all frontend apps
for app in ${DEV_FRONTEND_APPS[@]}; do
cf add-network-policy alertmanager $app -s "tanf-dev" --protocol tcp --port 80
cf add-network-policy grafana $app -s tanf-dev --protocol tcp --port 80
done
for app in ${STAGING_FRONTEND_APPS[@]}; do
cf add-network-policy alertmanager $app -s "tanf-staging" --protocol tcp --port 80
cf add-network-policy grafana $app -s tanf-staging --protocol tcp --port 80
done

# Add network policies to allow prometheus to talk to all backend apps in all environments
for app in ${DEV_BACKEND_APPS[@]}; do
cf add-network-policy prometheus $app -s tanf-dev --protocol tcp --port 8080
done
for app in ${STAGING_BACKEND_APPS[@]}; do
cf add-network-policy prometheus $app -s tanf-staging --protocol tcp --port 8080
done
}

setup_dev_staging_net_pols() {
# Add network policies to handle routing traffic from lower envs to the prod env
cf target -o hhs-acf-ofa -s tanf-dev
for i in ${!DEV_BACKEND_APPS[@]}; do
cf add-network-policy ${DEV_FRONTEND_APPS[$i]} grafana -s tanf-prod --protocol tcp --port 8080
cf add-network-policy ${DEV_BACKEND_APPS[$i]} loki -s tanf-prod --protocol tcp --port 8080
cf add-network-policy ${DEV_FRONTEND_APPS[$i]} alertmanager -s tanf-prod --protocol tcp --port 8080
done

cf target -o hhs-acf-ofa -s tanf-staging
for i in ${!STAGING_BACKEND_APPS[@]}; do
cf add-network-policy ${STAGING_FRONTEND_APPS[$i]} grafana -s tanf-prod --protocol tcp --port 8080
cf add-network-policy ${STAGING_BACKEND_APPS[$i]} loki -s tanf-prod --protocol tcp --port 8080
cf add-network-policy ${STAGING_FRONTEND_APPS[$i]} alertmanager -s tanf-prod --protocol tcp --port 8080
done

cf target -o hhs-acf-ofa -s tanf-prod
cf add-network-policy $PROD_FRONTEND grafana -s tanf-prod --protocol tcp --port 8080
cf add-network-policy $PROD_BACKEND loki -s tanf-prod --protocol tcp --port 8080
}

check_email_vars() {
if [ "${ADMIN_EMAILS}" != "" ] && [ "${DEV_EMAILS}" != "" ]; then
echo "${ADMIN_EMAILS}"
echo "${DEV_EMAILS}"
else
echo "Missing definitions for ADMIN_EMAILS or DEV_EMAILS or both."
exit 1
fi
}

err_help_exit() {
@@ -128,6 +169,8 @@ err_help_exit() {
exit
}

pushd "$(dirname "$0")"

while getopts ":hap:u:d:" option; do
case $option in
h) # display Help
@@ -143,9 +186,7 @@ while getopts ":hap:u:d:" option; do
d) # Bind a Postgres exporter or Grafana to $DB_SERVICE_NAME
DB_SERVICE_NAME=$OPTARG;;
\?) # Invalid option
echo "Error: Invalid option"
help
exit;;
err_help_exit "Error: Invalid option";;
esac
done

@@ -154,15 +195,18 @@ if [ "$#" -eq 0 ]; then
exit
fi

pushd "$(dirname "$0")"
check_email_vars

if [ "$DB_SERVICE_NAME" == "" ]; then
err_help_exit "Error: you must include a database service name."
fi
if [ "$DEPLOY" == "plg" ]; then
deploy_prometheus
deploy_loki
deploy_grafana $DB_SERVICE_NAME
setup_extra_net_pols
deploy_alertmanager
setup_prod_net_pols
setup_dev_staging_net_pols
fi
if [ "$DEPLOY" == "pg-exporter" ]; then
if [ "$DB_URI" == "" ]; then
1 change: 1 addition & 0 deletions tdrs-backend/plg/prometheus/alerts.local.yml
@@ -32,6 +32,7 @@ groups:
rules:
- alert: UpTime
expr: avg_over_time(up[1m]) < 0.95
for: 30m
labels:
severity: WARNING
annotations:
2 changes: 1 addition & 1 deletion tdrs-backend/plg/prometheus/alerts.yml
@@ -32,7 +32,7 @@ groups:
summary: "The {{ $labels.service }} service is down."
description: "The {{ $labels.service }} service in the {{ $labels.env }} environment has been down for more than 5 minutes."
- alert: StagingBackendDown
expr: last_over_time(up{job=~"tdp-backend-staging""}[1m]) == 0
expr: last_over_time(up{job=~"tdp-backend-staging"}[1m]) == 0
labels:
severity: ERROR
annotations:
2 changes: 1 addition & 1 deletion tdrs-backend/plg/prometheus/manifest.yml
@@ -11,6 +11,6 @@ applications:
mv ./prometheus-2.54.1.linux-amd64/prometheus ./prometheus &&
mv ./prometheus-2.54.1.linux-amd64/promtool ./promtool &&
rm -rf ./prometheus-2.54.1.linux-amd64 && rm -rf prometheus-2.54.1.linux-amd64.tar.gz &&
./prometheus --config.file=/home/vcap/app/prometheus.yml --storage.tsdb.path=/home/vcap/app/prometheus-data --storage.tsdb.retention.time=30d --storage.tsdb.retention.size=6GB --web.listen-address="0.0.0.0:8080"
./prometheus --config.file=/home/vcap/app/prometheus.yml --storage.tsdb.path=/home/vcap/app/prometheus-data --storage.tsdb.retention.time=30d --storage.tsdb.retention.size=6GB --web.listen-address="0.0.0.0:8080" --web.enable-lifecycle
buildpacks:
- https://github.com/cloudfoundry/binary-buildpack
7 changes: 4 additions & 3 deletions tdrs-backend/plg/prometheus/prometheus.yml
@@ -9,11 +9,12 @@ alerting:
- path_prefix: /alerts
static_configs:
- targets:
# - alertmanager.apps.internal:8080
- alertmanager.apps.internal:8080

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
- "django-rules.yml"
- "alerts.yml"

scrape_configs:
# The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
@@ -84,15 +85,15 @@ scrape_configs:

- job_name: loki
static_configs:
- targets: ["loki.apps.internal:3100"]
- targets: ["loki.apps.internal:8080"]
labels:
service: "loki"
env: "production"

- job_name: grafana
metrics_path: /grafana/metrics
static_configs:
- targets: ["grafana.app.cloud.gov:9400"]
- targets: ["grafana.apps.internal:8080"]
labels:
service: "grafana"
env: "production"
18 changes: 18 additions & 0 deletions tdrs-frontend/nginx/cloud.gov/locations.conf
@@ -78,6 +78,24 @@ location /grafana/ {
proxy_buffer_size 4k;
}

location /alerts/ {
auth_request /plg_auth_check;
auth_request_set $auth_status $upstream_status;

set $alerts http://alertmanager.apps.internal:8080$request_uri;
proxy_pass $alerts;
proxy_set_header Host $host:3000;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto https;

proxy_connect_timeout 300;
proxy_read_timeout 300;
proxy_send_timeout 300;
send_timeout 900;
proxy_buffer_size 4k;
}

location = /plg_auth_check {
internal;
set $endpoint http://{{env "BACKEND_HOST"}}.apps.internal:8080/plg_auth_check/;
