diff --git a/.gitconfig b/.gitconfig index b3cc6696c..d6c8c66b1 100644 --- a/.gitconfig +++ b/.gitconfig @@ -13,6 +13,7 @@ allowed = .*DJANGO_SECRET_KEY=.* #this is auto-generated in deployed environments allowed = ./tdrs-backend/manifest.proxy.yml:* allowed = ./tdrs-backend/plg/loki/manifest.yml:* + allowed = ./tdrs-backend/plg/deploy.sh:84 patterns = (A3T[A-Z0-9]|AKIA|AGPA|AIDA|AROA|AIPA|ANPA|ANVA|ASIA)[A-Z0-9]{16} patterns = (\"|')?(AWS|aws|Aws)?_?(SECRET|secret|Secret)?_?(ACCESS|access|Access)?_?(KEY|key|Key)(\"|')?\\s*(:|=>|=)\\s*(\"|')?[A-Za-z0-9/\\+=]{40}(\"|')? patterns = (\"|')?(AWS|aws|Aws)?_?(ACCOUNT|account|Account)_?(ID|id|Id)?(\"|')?\\s*(:|=>|=)\\s*(\"|')?[0-9]{4}\\-?[0-9]{4}\\-?[0-9]{4}(\"|')? diff --git a/tdrs-backend/plg/README.md b/tdrs-backend/plg/README.md index f0438e8f4..900a14b93 100644 --- a/tdrs-backend/plg/README.md +++ b/tdrs-backend/plg/README.md @@ -1,3 +1,31 @@ +# TDP PLG Stack +Before attempting to deploy the PLG stack or a postgres exporter you MUST have access to the production space in cloud.gov. + +## Deploying PLG +Before deploying the PLG stack you must have the `ADMIN_EMAILS` and `DEV_EMAILS` variables defined in your shell environment. The variables should be a comma separated string of emails, eg: `ADMIN_EMAILS="email1@email.com, email2@email.com, email3@email.com"` and `DEV_EMAILS="email4@email.com, email5@email.com, email6@email.com"`. + +Once both of the above items have been confirmed, you can target the production environment with the CF CLI and run the command below. + +``` +./deploy.sh -a -d tdp-db-prod +``` + +The command will deploy the entire PLG stack to the production environment and setup all appropriate network policies and routes. + +## Deploying a Postgres Exporter +Before deploying a postgres exporter, you need to acquire the AWS RDS database URI for the RDS instance in the environment you are deploying the exporter to. + +``` +cf env +``` + +From the output of this command find the `VCAP_SERVICES` variable. 
Within this variable is a JSON list of services the app you provided is bound to. Find the `aws-rds` key and copy the `uri` value to your clipboard from the `credentials` key. Then you can deploy your exporter with the command below. + +``` +./deploy.sh -p <env> -d <db-service-name> -u <db-uri> +``` +where `<env>` MUST be one of `[dev, staging, production]`, and `<db-uri>` is the uri you just copied from the app's `VCAP_SERVICES` environment variable. This command also handles all of the necessary networking configuration. + ### Grafana Auth and RBAC Config Grafana is accessible by any frontend app on a private route to users who have the correct role. The Grafana UI is not be accessible to any user or application unless they are routed to it via a frontend app. Grafana is configured to require user and password authentication. Having the extra layer of authentication is required because the roles defined in Grafana are not in alignment with the roles TDP defines. Assigning users to appropriate role and teams in Grafana allows for least privilege access to any information that Grafana might be able to display. diff --git a/tdrs-backend/plg/alertmanager/alertmanager.yml b/tdrs-backend/plg/alertmanager/alertmanager.yml index 9414062ae..77c981eb8 100644 --- a/tdrs-backend/plg/alertmanager/alertmanager.yml +++ b/tdrs-backend/plg/alertmanager/alertmanager.yml @@ -39,13 +39,22 @@ route: # The child route trees. routes: - # This routes performs a regular expression match on alert labels to - # catch alerts that are related to a list of services. + # Only alert dev team of uptime issues - matchers: - alertname=~"UpTime" receiver: dev-team-emails group_wait: 30m + # Send all severity CRITICAL/ERROR alerts to OFA admins and TDP Devs + - matchers: + - severity=~"ERROR|CRITICAL" + receiver: admin-team-emails + continue: true + - matchers: + - severity=~"ERROR|CRITICAL" + receiver: dev-team-emails + continue: true + # Inhibition rules allow to mute a set of alerts given that another alert is # firing. 
# We use this to mute any warning-level notifications if the same alert is @@ -65,7 +74,9 @@ receivers: - name: 'admin-team-emails' email_configs: - to: '{{ admin_team_emails }}' + send_resolved: true - name: 'dev-team-emails' email_configs: - to: '{{ dev_team_emails }}' + send_resolved: true diff --git a/tdrs-backend/plg/alertmanager/manifest.yml b/tdrs-backend/plg/alertmanager/manifest.yml index 80067f717..b66a4758d 100644 --- a/tdrs-backend/plg/alertmanager/manifest.yml +++ b/tdrs-backend/plg/alertmanager/manifest.yml @@ -1,10 +1,14 @@ version: 1 applications: - name: alertmanager - memory: 512M - disk_quota: 1G + memory: 128M + disk_quota: 5G instances: 1 command: | - mkdir /tmp + mkdir data + wget https://github.com/prometheus/alertmanager/releases/download/v0.27.0/alertmanager-0.27.0.linux-amd64.tar.gz + tar -zxvf alertmanager-0.27.0.linux-amd64.tar.gz + rm -rf alertmanager-0.27.0.linux-amd64.tar.gz + ./alertmanager-0.27.0.linux-amd64/alertmanager --config.file=/home/vcap/app/alertmanager.prod.yml --web.listen-address=:8080 --storage.path=/home/vcap/app/data --log.level=debug --web.external-url=http://alertmanager.apps.internal:8080/alerts --web.route-prefix=/alerts --cluster.listen-address="" buildpacks: - https://github.com/cloudfoundry/binary-buildpack diff --git a/tdrs-backend/plg/deploy.sh b/tdrs-backend/plg/deploy.sh index c411f5457..1d5648312 100755 --- a/tdrs-backend/plg/deploy.sh +++ b/tdrs-backend/plg/deploy.sh @@ -46,7 +46,6 @@ deploy_pg_exporter() { deploy_grafana() { pushd grafana - APP_NAME="grafana" DATASOURCES="datasources.yml" cp datasources.template.yml $DATASOURCES MANIFEST=manifest.tmp.yml @@ -57,20 +56,7 @@ deploy_grafana() { yq eval -i ".applications[0].services[0] = \"$1\"" $MANIFEST cf push --no-route -f $MANIFEST -t 180 --strategy rolling - cf map-route $APP_NAME apps.internal --hostname $APP_NAME - - # Add policy to allow grafana to talk to prometheus and loki - cf add-network-policy $APP_NAME prometheus --protocol tcp --port 8080 - 
cf add-network-policy $APP_NAME loki --protocol tcp --port 8080 - - # Add network policies to allow grafana to talk to all frontend apps in all environments - for app in ${DEV_FRONTEND_APPS[@]}; do - cf add-network-policy "grafana" $app -s "tanf-dev" --protocol tcp --port 80 - done - for app in ${STAGING_FRONTEND_APPS[@]}; do - cf add-network-policy "grafana" $app -s "tanf-staging" --protocol tcp --port 80 - done - cf add-network-policy "grafana" $PROD_FRONTEND --protocol tcp --port 80 + cf map-route grafana apps.internal --hostname grafana rm $DATASOURCES rm $MANIFEST @@ -81,16 +67,6 @@ deploy_prometheus() { pushd prometheus cf push --no-route -f manifest.yml -t 180 --strategy rolling cf map-route prometheus apps.internal --hostname prometheus - - # Add network policies to allow prometheus to talk to all backend apps in all environments - for app in ${DEV_BACKEND_APPS[@]}; do - cf add-network-policy prometheus $app -s "tanf-dev" --protocol tcp --port 8080 - done - for app in ${STAGING_BACKEND_APPS[@]}; do - cf add-network-policy prometheus $app -s "tanf-staging" --protocol tcp --port 8080 - done - cf add-network-policy prometheus $PROD_BACKEND --protocol tcp --port 8080 - popd } @@ -101,23 +77,88 @@ deploy_loki() { popd } -setup_extra_net_pols() { - # Add network policies to allow frontend/backend to talk to grafana/loki +deploy_alertmanager() { + pushd alertmanager + CONFIG=alertmanager.prod.yml + cp alertmanager.yml $CONFIG + SENDGRID_API_KEY=$(cf env tdp-backend-prod | grep SENDGRID | cut -d " " -f2-) + yq eval -i ".global.smtp_auth_password = \"$SENDGRID_API_KEY\"" $CONFIG + yq eval -i ".receivers[0].email_configs[0].to = \"${ADMIN_EMAILS}\"" $CONFIG + yq eval -i ".receivers[1].email_configs[0].to = \"${DEV_EMAILS}\"" $CONFIG + cf push --no-route -f manifest.yml -t 180 --strategy rolling + cf map-route alertmanager apps.internal --hostname alertmanager + rm $CONFIG + popd +} + +setup_prod_net_pols() { + # Target prod environment just in case + cf target -o 
hhs-acf-ofa -s tanf-prod + + # Let grafana talk to prometheus and loki + cf add-network-policy grafana prometheus --protocol tcp --port 8080 + cf add-network-policy grafana loki --protocol tcp --port 8080 + + # Let prometheus talk to alertmanager/grafana/loki/prod backend + cf add-network-policy prometheus alertmanager --protocol tcp --port 8080 + cf add-network-policy prometheus $PROD_BACKEND --protocol tcp --port 8080 + cf add-network-policy prometheus grafana --protocol tcp --port 8080 + cf add-network-policy prometheus loki --protocol tcp --port 8080 + + # Let alertmanager/grafana talk to the prod frontend and vice versa + cf add-network-policy alertmanager $PROD_FRONTEND --protocol tcp --port 80 + cf add-network-policy grafana $PROD_FRONTEND --protocol tcp --port 80 + cf add-network-policy $PROD_FRONTEND alertmanager -s tanf-prod --protocol tcp --port 8080 + cf add-network-policy $PROD_FRONTEND grafana -s tanf-prod --protocol tcp --port 8080 + + # Let prod backend send logs to loki + cf add-network-policy $PROD_BACKEND loki -s tanf-prod --protocol tcp --port 8080 + + # Add network policies to allow alertmanager/grafana to talk to all frontend apps + for app in ${DEV_FRONTEND_APPS[@]}; do + cf add-network-policy alertmanager $app -s "tanf-dev" --protocol tcp --port 80 + cf add-network-policy grafana $app -s tanf-dev --protocol tcp --port 80 + done + for app in ${STAGING_FRONTEND_APPS[@]}; do + cf add-network-policy alertmanager $app -s "tanf-staging" --protocol tcp --port 80 + cf add-network-policy grafana $app -s tanf-staging --protocol tcp --port 80 + done + + # Add network policies to allow prometheus to talk to all backend apps in all environments + for app in ${DEV_BACKEND_APPS[@]}; do + cf add-network-policy prometheus $app -s tanf-dev --protocol tcp --port 8080 + done + for app in ${STAGING_BACKEND_APPS[@]}; do + cf add-network-policy prometheus $app -s tanf-staging --protocol tcp --port 8080 + done +} + +setup_dev_staging_net_pols() { + # Add network 
policies to handle routing traffic from lower envs to the prod env cf target -o hhs-acf-ofa -s tanf-dev for i in ${!DEV_BACKEND_APPS[@]}; do cf add-network-policy ${DEV_FRONTEND_APPS[$i]} grafana -s tanf-prod --protocol tcp --port 8080 cf add-network-policy ${DEV_BACKEND_APPS[$i]} loki -s tanf-prod --protocol tcp --port 8080 + cf add-network-policy ${DEV_FRONTEND_APPS[$i]} alertmanager -s tanf-prod --protocol tcp --port 8080 done cf target -o hhs-acf-ofa -s tanf-staging for i in ${!STAGING_BACKEND_APPS[@]}; do cf add-network-policy ${STAGING_FRONTEND_APPS[$i]} grafana -s tanf-prod --protocol tcp --port 8080 cf add-network-policy ${STAGING_BACKEND_APPS[$i]} loki -s tanf-prod --protocol tcp --port 8080 + cf add-network-policy ${STAGING_FRONTEND_APPS[$i]} alertmanager -s tanf-prod --protocol tcp --port 8080 done - cf target -o hhs-acf-ofa -s tanf-prod - cf add-network-policy $PROD_FRONTEND grafana -s tanf-prod --protocol tcp --port 8080 - cf add-network-policy $PROD_BACKEND loki -s tanf-prod --protocol tcp --port 8080 +} + +check_email_vars() { + if [ "${ADMIN_EMAILS}" != "" ] && [ "${DEV_EMAILS}" != "" ]; then + echo "${ADMIN_EMAILS}" + echo "${DEV_EMAILS}" + else + echo "Missing definitions for ADMIN_EMAILS or DEV_EMAILS or both." + exit 1 + fi } err_help_exit() { @@ -128,6 +169,8 @@ err_help_exit() { exit } +pushd "$(dirname "$0")" + while getopts ":hap:u:d:" option; do case $option in h) # display Help @@ -143,9 +186,7 @@ while getopts ":hap:u:d:" option; do d) # Bind a Postgres exporter or Grafana to $DB_SERVICE_NAME DB_SERVICE_NAME=$OPTARG;; \?) # Invalid option - echo "Error: Invalid option" - help - exit;; + err_help_exit "Error: Invalid option";; esac done @@ -154,7 +195,8 @@ if [ "$#" -eq 0 ]; then exit fi -pushd "$(dirname "$0")" +check_email_vars + if [ "$DB_SERVICE_NAME" == "" ]; then err_help_exit "Error: you must include a database service name." 
fi @@ -162,7 +204,9 @@ if [ "$DEPLOY" == "plg" ]; then deploy_prometheus deploy_loki deploy_grafana $DB_SERVICE_NAME - setup_extra_net_pols + deploy_alertmanager + setup_prod_net_pols + setup_dev_staging_net_pols fi if [ "$DEPLOY" == "pg-exporter" ]; then if [ "$DB_URI" == "" ]; then diff --git a/tdrs-backend/plg/prometheus/alerts.local.yml b/tdrs-backend/plg/prometheus/alerts.local.yml index 99183c544..a13cc7543 100644 --- a/tdrs-backend/plg/prometheus/alerts.local.yml +++ b/tdrs-backend/plg/prometheus/alerts.local.yml @@ -32,6 +32,7 @@ groups: rules: - alert: UpTime expr: avg_over_time(up[1m]) < 0.95 + for: 30m labels: severity: WARNING annotations: diff --git a/tdrs-backend/plg/prometheus/alerts.yml b/tdrs-backend/plg/prometheus/alerts.yml index affe54498..da22ba57f 100644 --- a/tdrs-backend/plg/prometheus/alerts.yml +++ b/tdrs-backend/plg/prometheus/alerts.yml @@ -32,7 +32,7 @@ groups: summary: "The {{ $labels.service }} service is down." description: "The {{ $labels.service }} service in the {{ $labels.env }} environment has been down for more than 5 minutes." 
- alert: StagingBackendDown - expr: last_over_time(up{job=~"tdp-backend-staging""}[1m]) == 0 + expr: last_over_time(up{job=~"tdp-backend-staging"}[1m]) == 0 labels: severity: ERROR annotations: diff --git a/tdrs-backend/plg/prometheus/manifest.yml b/tdrs-backend/plg/prometheus/manifest.yml index da68fdbd3..d5c72d72f 100644 --- a/tdrs-backend/plg/prometheus/manifest.yml +++ b/tdrs-backend/plg/prometheus/manifest.yml @@ -11,6 +11,6 @@ applications: mv ./prometheus-2.54.1.linux-amd64/prometheus ./prometheus && mv ./prometheus-2.54.1.linux-amd64/promtool ./promtool && rm -rf ./prometheus-2.54.1.linux-amd64 && rm -rf prometheus-2.54.1.linux-amd64.tar.gz && - ./prometheus --config.file=/home/vcap/app/prometheus.yml --storage.tsdb.path=/home/vcap/app/prometheus-data --storage.tsdb.retention.time=30d --storage.tsdb.retention.size=6GB --web.listen-address="0.0.0.0:8080" + ./prometheus --config.file=/home/vcap/app/prometheus.yml --storage.tsdb.path=/home/vcap/app/prometheus-data --storage.tsdb.retention.time=30d --storage.tsdb.retention.size=6GB --web.listen-address="0.0.0.0:8080" --web.enable-lifecycle buildpacks: - https://github.com/cloudfoundry/binary-buildpack diff --git a/tdrs-backend/plg/prometheus/prometheus.yml b/tdrs-backend/plg/prometheus/prometheus.yml index 66e35c519..55241934b 100644 --- a/tdrs-backend/plg/prometheus/prometheus.yml +++ b/tdrs-backend/plg/prometheus/prometheus.yml @@ -9,11 +9,12 @@ alerting: - path_prefix: /alerts static_configs: - targets: - # - alertmanager.apps.internal:8080 + - alertmanager.apps.internal:8080 # Load rules once and periodically evaluate them according to the global 'evaluation_interval'. rule_files: - "django-rules.yml" + - "alerts.yml" scrape_configs: # The job name is added as a label `job=` to any timeseries scraped from this config. 
@@ -84,7 +85,7 @@ scrape_configs: - job_name: loki static_configs: - - targets: ["loki.apps.internal:3100"] + - targets: ["loki.apps.internal:8080"] labels: service: "loki" env: "production" @@ -92,7 +93,7 @@ scrape_configs: - job_name: grafana metrics_path: /grafana/metrics static_configs: - - targets: ["grafana.app.cloud.gov:9400"] + - targets: ["grafana.apps.internal:8080"] labels: service: "grafana" env: "production" diff --git a/tdrs-backend/tdpservice/parsers/schema_defs/ssp/m2.py b/tdrs-backend/tdpservice/parsers/schema_defs/ssp/m2.py index 20edf6fdb..29b6cb564 100644 --- a/tdrs-backend/tdpservice/parsers/schema_defs/ssp/m2.py +++ b/tdrs-backend/tdpservice/parsers/schema_defs/ssp/m2.py @@ -94,7 +94,7 @@ condition_field_name='FAMILY_AFFILIATION', condition_function=category3.isEqual(1), result_field_name='CITIZENSHIP_STATUS', - result_function=category3.isOneOf((1, 2)), + result_function=category3.isOneOf((1, 2, 3)), ), category3.ifThenAlso( condition_field_name='FAMILY_AFFILIATION', diff --git a/tdrs-backend/tdpservice/parsers/schema_defs/ssp/m3.py b/tdrs-backend/tdpservice/parsers/schema_defs/ssp/m3.py index 6f44c551e..64285ba6e 100644 --- a/tdrs-backend/tdpservice/parsers/schema_defs/ssp/m3.py +++ b/tdrs-backend/tdpservice/parsers/schema_defs/ssp/m3.py @@ -92,7 +92,7 @@ condition_field_name='FAMILY_AFFILIATION', condition_function=category3.isEqual(1), result_field_name='CITIZENSHIP_STATUS', - result_function=category3.isOneOf((1, 2)), + result_function=category3.isOneOf((1, 2, 3)), ), category3.ifThenAlso( condition_field_name='FAMILY_AFFILIATION', @@ -409,7 +409,7 @@ condition_field_name='FAMILY_AFFILIATION', condition_function=category3.isEqual(1), result_field_name='CITIZENSHIP_STATUS', - result_function=category3.isOneOf((1, 2)), + result_function=category3.isOneOf((1, 2, 3)), ), category3.ifThenAlso( condition_field_name='FAMILY_AFFILIATION', diff --git a/tdrs-backend/tdpservice/parsers/test/test_parse.py 
b/tdrs-backend/tdpservice/parsers/test/test_parse.py index d01a44030..1e9cd3840 100644 --- a/tdrs-backend/tdpservice/parsers/test/test_parse.py +++ b/tdrs-backend/tdpservice/parsers/test/test_parse.py @@ -498,7 +498,7 @@ def test_parse_ssp_section1_datafile(ssp_section1_datafile, dfs): assert cat4_errors[1].error_message == "Duplicate record detected with record type M3 at line 3273. " + \ "Record is a duplicate of the record at line number 3272." - assert parser_errors.count() == 32488 + assert parser_errors.count() == 32455 assert SSP_M1.objects.count() == expected_m1_record_count assert SSP_M2.objects.count() == expected_m2_record_count diff --git a/tdrs-frontend/nginx/cloud.gov/locations.conf b/tdrs-frontend/nginx/cloud.gov/locations.conf index 2e14fc69f..85f681543 100644 --- a/tdrs-frontend/nginx/cloud.gov/locations.conf +++ b/tdrs-frontend/nginx/cloud.gov/locations.conf @@ -78,6 +78,24 @@ location /grafana/ { proxy_buffer_size 4k; } +location /alerts/ { + auth_request /plg_auth_check; + auth_request_set $auth_status $upstream_status; + + set $alerts http://alertmanager.apps.internal:8080$request_uri; + proxy_pass $alerts; + proxy_set_header Host $host:3000; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto https; + + proxy_connect_timeout 300; + proxy_read_timeout 300; + proxy_send_timeout 300; + send_timeout 900; + proxy_buffer_size 4k; +} + location = /plg_auth_check { internal; set $endpoint http://{{env "BACKEND_HOST"}}.apps.internal:8080/plg_auth_check/; diff --git a/tdrs-frontend/src/components/Footer/Footer.jsx b/tdrs-frontend/src/components/Footer/Footer.jsx index 7b0eb0539..2b6e607dc 100644 --- a/tdrs-frontend/src/components/Footer/Footer.jsx +++ b/tdrs-frontend/src/components/Footer/Footer.jsx @@ -34,8 +34,6 @@ function Footer() { ) : null} - -