From 14eb6f32dfd8fc197ce750b531bc900efd094e3a Mon Sep 17 00:00:00 2001 From: Alex Heye Date: Thu, 2 Dec 2021 00:35:20 +0000 Subject: [PATCH 1/5] Add dashboard monitoring resources Added a resource that creates a monitoring dashboard based on a json.tpl file. The dashboards can be customized with widgets that may eventually be outputs from other resources and can be set up to filter by deployment_name. Also includes two default dashboard JSON files: * Empty: empty with the exception of a text box * HPC: HPC focused dashboard --- resources/monitoring/dashboard/README.md | 37 ++ .../dashboard/dashboards/Empty.json.tpl | 17 + .../dashboard/dashboards/HPC.json.tpl | 595 ++++++++++++++++++ resources/monitoring/dashboard/main.tf | 13 + resources/monitoring/dashboard/module.json | 64 ++ resources/monitoring/dashboard/provider.tf | 19 + resources/monitoring/dashboard/variables.tf | 21 + resources/monitoring/dashboard/versions.tf | 26 + .../test_configs/dashboards.yaml | 58 ++ 9 files changed, 850 insertions(+) create mode 100644 resources/monitoring/dashboard/README.md create mode 100644 resources/monitoring/dashboard/dashboards/Empty.json.tpl create mode 100644 resources/monitoring/dashboard/dashboards/HPC.json.tpl create mode 100644 resources/monitoring/dashboard/main.tf create mode 100644 resources/monitoring/dashboard/module.json create mode 100644 resources/monitoring/dashboard/provider.tf create mode 100644 resources/monitoring/dashboard/variables.tf create mode 100644 resources/monitoring/dashboard/versions.tf create mode 100644 tools/test_examples/test_configs/dashboards.yaml diff --git a/resources/monitoring/dashboard/README.md b/resources/monitoring/dashboard/README.md new file mode 100644 index 0000000000..2b7abb1f5b --- /dev/null +++ b/resources/monitoring/dashboard/README.md @@ -0,0 +1,37 @@ + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 0.14.0 | +| [google](#requirement\_google) | ~> 3.0 | + +## 
Providers + +| Name | Version | +|------|---------| +| [google](#provider\_google) | ~> 3.0 | + +## Modules + +No modules. + +## Resources + +| Name | Type | +|------|------| +| [google_monitoring_dashboard.dashboard](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_dashboard) | resource | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [base\_dashboard](#input\_base\_dashboard) | Baseline dashboard template, either custom or from ./dashboards | `string` | `"HPC"` | no | +| [deployment\_name](#input\_deployment\_name) | The name of the current deployment | `string` | n/a | yes | +| [project\_id](#input\_project\_id) | Project in which the HPC deployment will be created | `string` | n/a | yes | +| [widgets](#input\_widgets) | List of additional widgets to add to the base dashboard. | `list(string)` | `[]` | no | + +## Outputs + +No outputs. + \ No newline at end of file diff --git a/resources/monitoring/dashboard/dashboards/Empty.json.tpl b/resources/monitoring/dashboard/dashboards/Empty.json.tpl new file mode 100644 index 0000000000..347cb5be02 --- /dev/null +++ b/resources/monitoring/dashboard/dashboards/Empty.json.tpl @@ -0,0 +1,17 @@ +{ + "displayName": "HPC Toolkit: ${deployment_name}", + "gridLayout": { + "columns": 2, + "widgets": [ + { + "text": { + "content": "Metrics from the ${deployment_name} deployment of the HPC Toolkit.", + "format": "MARKDOWN" + }, + "title": "HPC Toolkit" + }%{ for widget in widgets ~}, + ${widget} + %{endfor ~} + ] + } +} diff --git a/resources/monitoring/dashboard/dashboards/HPC.json.tpl b/resources/monitoring/dashboard/dashboards/HPC.json.tpl new file mode 100644 index 0000000000..81d564790f --- /dev/null +++ b/resources/monitoring/dashboard/dashboards/HPC.json.tpl @@ -0,0 +1,595 @@ +{ + "category": "CUSTOM", + "displayName": "HPC: ${deployment_name}", + "gridLayout": { + "columns": 2, + "widgets": [ + { + "text": { 
+ "content": "HPC metrics from the ${deployment_name} deployment of the HPC Toolkit.", + "format": "MARKDOWN" + }, + "title": "HPC Toolkit" + }, + { + "title": "VM Instance - Memory utilization", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "minAlignmentPeriod": "60s", + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "apiSource": "DEFAULT_CLOUD", + "timeSeriesFilter": { + "aggregation": { + "alignmentPeriod": "60s", + "crossSeriesReducer": "REDUCE_NONE", + "perSeriesAligner": "ALIGN_MEAN" + }, + "filter": "metric.type=\"agent.googleapis.com/memory/percent_used\" resource.type=\"gce_instance\" metadata.user_labels.\"ghpc_deployment\"=\"${deployment_name}\"" + } + } + } + ], + "timeshiftDuration": "0s", + "yAxis": { + "label": "y1Axis", + "scale": "LINEAR" + } + } + }, + { + "title": "GCE VM Instance - CPU Utilization", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "minAlignmentPeriod": "60s", + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "apiSource": "DEFAULT_CLOUD", + "timeSeriesFilter": { + "aggregation": { + "alignmentPeriod": "60s", + "crossSeriesReducer": "REDUCE_NONE", + "perSeriesAligner": "ALIGN_MEAN" + }, + "filter": "metric.type=\"compute.googleapis.com/instance/cpu/utilization\" resource.type=\"gce_instance\" metadata.user_labels.\"ghpc_deployment\"=\"${deployment_name}\"", + "pickTimeSeriesFilter": { + "direction": "TOP", + "numTimeSeries": 20, + "rankingMethod": "METHOD_MEAN" + } + } + } + } + ], + "timeshiftDuration": "0s", + "yAxis": { + "label": "y1Axis", + "scale": "LINEAR" + } + } + }, + { + "title": "GCE VM Instance - CPU utilization (agent)", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "minAlignmentPeriod": "60s", + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "apiSource": "DEFAULT_CLOUD", + "timeSeriesFilter": { + "aggregation": { + "alignmentPeriod": "60s", + 
"crossSeriesReducer": "REDUCE_NONE", + "perSeriesAligner": "ALIGN_MEAN" + }, + "filter": "metric.type=\"agent.googleapis.com/cpu/utilization\" resource.type=\"gce_instance\" metadata.user_labels.\"ghpc_deployment\"=\"${deployment_name}\"" + }, + "unitOverride": "%" + } + } + ], + "timeshiftDuration": "0s", + "yAxis": { + "label": "y1Axis", + "scale": "LINEAR" + } + } + }, + { + "title": "GCE VM Instance - Disk read operations", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "minAlignmentPeriod": "60s", + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "apiSource": "DEFAULT_CLOUD", + "timeSeriesFilter": { + "aggregation": { + "alignmentPeriod": "60s", + "crossSeriesReducer": "REDUCE_NONE", + "perSeriesAligner": "ALIGN_RATE" + }, + "filter": "metric.type=\"compute.googleapis.com/instance/disk/read_ops_count\" resource.type=\"gce_instance\" metadata.user_labels.\"ghpc_deployment\"=\"${deployment_name}\"" + } + } + } + ], + "timeshiftDuration": "0s", + "yAxis": { + "label": "y1Axis", + "scale": "LINEAR" + } + } + }, + { + "title": "GCE VM Instance - Disk write operations", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "minAlignmentPeriod": "60s", + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "apiSource": "DEFAULT_CLOUD", + "timeSeriesFilter": { + "aggregation": { + "alignmentPeriod": "60s", + "crossSeriesReducer": "REDUCE_NONE", + "perSeriesAligner": "ALIGN_RATE" + }, + "filter": "metric.type=\"compute.googleapis.com/instance/disk/write_ops_count\" resource.type=\"gce_instance\" metadata.user_labels.\"ghpc_deployment\"=\"${deployment_name}\"" + } + } + } + ], + "timeshiftDuration": "0s", + "yAxis": { + "label": "y1Axis", + "scale": "LINEAR" + } + } + }, + { + "title": "GCE VM Instance - Disk Read Bytes", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "minAlignmentPeriod": "60s", + "plotType": "LINE", + "targetAxis": "Y1", + 
"timeSeriesQuery": { + "apiSource": "DEFAULT_CLOUD", + "timeSeriesFilter": { + "aggregation": { + "alignmentPeriod": "60s", + "crossSeriesReducer": "REDUCE_NONE", + "perSeriesAligner": "ALIGN_RATE" + }, + "filter": "metric.type=\"agent.googleapis.com/disk/read_bytes_count\" resource.type=\"gce_instance\" metadata.user_labels.\"ghpc_deployment\"=\"${deployment_name}\"" + } + } + } + ], + "timeshiftDuration": "0s", + "yAxis": { + "label": "y1Axis", + "scale": "LINEAR" + } + } + }, + { + "title": "GCE VM Instance - Disk Write Bytes", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "minAlignmentPeriod": "60s", + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "apiSource": "DEFAULT_CLOUD", + "timeSeriesFilter": { + "aggregation": { + "alignmentPeriod": "60s", + "crossSeriesReducer": "REDUCE_NONE", + "perSeriesAligner": "ALIGN_RATE" + }, + "filter": "metric.type=\"agent.googleapis.com/disk/write_bytes_count\" resource.type=\"gce_instance\" metadata.user_labels.\"ghpc_deployment\"=\"${deployment_name}\"" + } + } + } + ], + "timeshiftDuration": "0s", + "yAxis": { + "label": "y1Axis", + "scale": "LINEAR" + } + } + }, + { + "title": "Throttled read bytes", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "minAlignmentPeriod": "60s", + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "apiSource": "DEFAULT_CLOUD", + "timeSeriesFilter": { + "aggregation": { + "alignmentPeriod": "60s", + "crossSeriesReducer": "REDUCE_NONE", + "perSeriesAligner": "ALIGN_RATE" + }, + "filter": "metric.type=\"compute.googleapis.com/instance/disk/throttled_read_bytes_count\" resource.type=\"gce_instance\" metadata.user_labels.\"ghpc_deployment\"=\"${deployment_name}\"" + } + } + } + ], + "timeshiftDuration": "0s", + "yAxis": { + "label": "y1Axis", + "scale": "LINEAR" + } + } + }, + { + "title": "Throttled write bytes", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + 
"minAlignmentPeriod": "60s", + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "apiSource": "DEFAULT_CLOUD", + "timeSeriesFilter": { + "aggregation": { + "alignmentPeriod": "60s", + "crossSeriesReducer": "REDUCE_NONE", + "perSeriesAligner": "ALIGN_RATE" + }, + "filter": "metric.type=\"compute.googleapis.com/instance/disk/throttled_write_bytes_count\" resource.type=\"gce_instance\" metadata.user_labels.\"ghpc_deployment\"=\"${deployment_name}\"" + } + } + } + ], + "timeshiftDuration": "0s", + "yAxis": { + "label": "y1Axis", + "scale": "LINEAR" + } + } + }, + { + "title": "GCE VM Instance - Received packets", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "minAlignmentPeriod": "60s", + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "apiSource": "DEFAULT_CLOUD", + "timeSeriesFilter": { + "aggregation": { + "alignmentPeriod": "60s", + "crossSeriesReducer": "REDUCE_NONE", + "perSeriesAligner": "ALIGN_RATE" + }, + "filter": "metric.type=\"compute.googleapis.com/instance/network/received_packets_count\" resource.type=\"gce_instance\" metadata.user_labels.\"ghpc_deployment\"=\"${deployment_name}\"" + } + } + } + ], + "timeshiftDuration": "0s", + "yAxis": { + "label": "y1Axis", + "scale": "LINEAR" + } + } + }, + { + "title": "VM Instance - Sent packets", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "minAlignmentPeriod": "60s", + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "apiSource": "DEFAULT_CLOUD", + "timeSeriesFilter": { + "aggregation": { + "alignmentPeriod": "60s", + "crossSeriesReducer": "REDUCE_NONE", + "perSeriesAligner": "ALIGN_RATE" + }, + "filter": "metric.type=\"compute.googleapis.com/instance/network/sent_packets_count\" resource.type=\"gce_instance\" metadata.user_labels.\"ghpc_deployment\"=\"${deployment_name}\"" + } + } + } + ], + "timeshiftDuration": "0s", + "yAxis": { + "label": "y1Axis", + "scale": "LINEAR" + } + } + }, + { + 
"title": "VM Instance - Received bytes", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "minAlignmentPeriod": "60s", + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "apiSource": "DEFAULT_CLOUD", + "timeSeriesFilter": { + "aggregation": { + "alignmentPeriod": "60s", + "crossSeriesReducer": "REDUCE_NONE", + "perSeriesAligner": "ALIGN_RATE" + }, + "filter": "metric.type=\"compute.googleapis.com/instance/network/received_bytes_count\" resource.type=\"gce_instance\" metadata.user_labels.\"ghpc_deployment\"=\"${deployment_name}\"" + } + } + } + ], + "timeshiftDuration": "0s", + "yAxis": { + "label": "y1Axis", + "scale": "LINEAR" + } + } + }, + { + "title": "GCE VM Instance - Sent bytes", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "minAlignmentPeriod": "60s", + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "apiSource": "DEFAULT_CLOUD", + "timeSeriesFilter": { + "aggregation": { + "alignmentPeriod": "60s", + "crossSeriesReducer": "REDUCE_MEAN", + "groupByFields": [ + "metric.label.\"instance_name\"", + "metric.label.\"loadbalanced\"", + "resource.label.\"project_id\"", + "resource.label.\"instance_id\"", + "resource.label.\"zone\"" + ], + "perSeriesAligner": "ALIGN_RATE" + }, + "filter": "metric.type=\"compute.googleapis.com/instance/network/sent_bytes_count\" resource.type=\"gce_instance\" metadata.user_labels.\"ghpc_deployment\"=\"${deployment_name}\"", + "secondaryAggregation": { + "alignmentPeriod": "60s", + "crossSeriesReducer": "REDUCE_NONE", + "perSeriesAligner": "ALIGN_MEAN" + } + } + } + } + ], + "timeshiftDuration": "0s", + "yAxis": { + "label": "y1Axis", + "scale": "LINEAR" + } + } + }, + { + "title": "GCE VM Instance - Network Traffic Bytes (agent)", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "minAlignmentPeriod": "60s", + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "apiSource": 
"DEFAULT_CLOUD", + "timeSeriesFilter": { + "aggregation": { + "alignmentPeriod": "60s", + "crossSeriesReducer": "REDUCE_NONE", + "perSeriesAligner": "ALIGN_RATE" + }, + "filter": "metric.type=\"agent.googleapis.com/interface/traffic\" resource.type=\"gce_instance\" metadata.user_labels.\"ghpc_deployment\"=\"${deployment_name}\"" + } + } + } + ], + "timeshiftDuration": "0s", + "yAxis": { + "label": "y1Axis", + "scale": "LINEAR" + } + } + }, + { + "title": "Network Packets (agent)", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "minAlignmentPeriod": "60s", + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "apiSource": "DEFAULT_CLOUD", + "timeSeriesFilter": { + "aggregation": { + "alignmentPeriod": "60s", + "crossSeriesReducer": "REDUCE_NONE", + "perSeriesAligner": "ALIGN_RATE" + }, + "filter": "metric.type=\"agent.googleapis.com/interface/packets\" resource.type=\"gce_instance\" metadata.user_labels.\"ghpc_deployment\"=\"${deployment_name}\"" + } + } + } + ], + "timeshiftDuration": "0s", + "yAxis": { + "label": "y1Axis", + "scale": "LINEAR" + } + } + }, + { + "title": "TCP connections", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "minAlignmentPeriod": "60s", + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "apiSource": "DEFAULT_CLOUD", + "timeSeriesFilter": { + "aggregation": { + "alignmentPeriod": "60s", + "crossSeriesReducer": "REDUCE_NONE", + "perSeriesAligner": "ALIGN_MEAN" + }, + "filter": "metric.type=\"agent.googleapis.com/network/tcp_connections\"" + }, + "unitOverride": "1" + } + } + ], + "timeshiftDuration": "0s", + "yAxis": { + "label": "y1Axis", + "scale": "LINEAR" + } + } + }, + { + "title": "VM Instance - CPU utilization for steal", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "minAlignmentPeriod": "60s", + "plotType": "STACKED_BAR", + "targetAxis": "Y1", + "timeSeriesQuery": { + "apiSource": "DEFAULT_CLOUD", + 
"timeSeriesFilter": { + "aggregation": { + "alignmentPeriod": "60s", + "crossSeriesReducer": "REDUCE_NONE", + "perSeriesAligner": "ALIGN_MAX" + }, + "filter": "metric.type=\"agent.googleapis.com/cpu/utilization\" resource.type=\"gce_instance\" metadata.user_labels.\"ghpc_deployment\"=\"${deployment_name}\" metric.label.\"cpu_state\"=\"steal\"" + } + } + } + ], + "timeshiftDuration": "0s", + "yAxis": { + "label": "y1Axis", + "scale": "LINEAR" + } + } + }, + { + "title": "VM Instance - CPU utilization [MEAN]", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "minAlignmentPeriod": "60s", + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "apiSource": "DEFAULT_CLOUD", + "timeSeriesFilter": { + "aggregation": { + "alignmentPeriod": "60s", + "crossSeriesReducer": "REDUCE_NONE", + "perSeriesAligner": "ALIGN_MEAN" + }, + "filter": "metric.type=\"compute.googleapis.com/instance/cpu/utilization\" resource.type=\"gce_instance\" metadata.user_labels.\"ghpc_deployment\"=\"${deployment_name}\"" + } + } + } + ], + "timeshiftDuration": "0s", + "yAxis": { + "label": "y1Axis", + "scale": "LINEAR" + } + } + }%{ for widget in widgets ~}, + ${widget} + %{endfor ~} + ] + } +} diff --git a/resources/monitoring/dashboard/main.tf b/resources/monitoring/dashboard/main.tf new file mode 100644 index 0000000000..f3a763b7b2 --- /dev/null +++ b/resources/monitoring/dashboard/main.tf @@ -0,0 +1,13 @@ + +locals { + dash_path = "${path.module}/dashboards/${var.base_dashboard}.json.tpl" +} + +resource "google_monitoring_dashboard" "dashboard" { + dashboard_json = templatefile(local.dash_path, { + widgets = var.widgets + deployment_name = var.deployment_name + } + ) + project = var.project_id +} diff --git a/resources/monitoring/dashboard/module.json b/resources/monitoring/dashboard/module.json new file mode 100644 index 0000000000..165d7b2b7e --- /dev/null +++ b/resources/monitoring/dashboard/module.json @@ -0,0 +1,64 @@ +{ + "header": "", + "footer": 
"", + "inputs": [ + { + "name": "base_dashboard", + "type": "string", + "description": "Baseline dashboard template, either custom or from ./dashboards", + "default": "HPC", + "required": false + }, + { + "name": "deployment_name", + "type": "string", + "description": "The name of the current deployment", + "default": null, + "required": true + }, + { + "name": "project_id", + "type": "string", + "description": "Project in which the HPC deployment will be created", + "default": null, + "required": true + }, + { + "name": "widgets", + "type": "list(string)", + "description": "List of additional widgets to add to the base dashboard.", + "default": [], + "required": false + } + ], + "modules": [], + "outputs": [], + "providers": [ + { + "name": "google", + "alias": null, + "version": "~\u003e 3.0" + } + ], + "requirements": [ + { + "name": "terraform", + "version": "\u003e= 0.14.0" + }, + { + "name": "google", + "version": "~\u003e 3.0" + } + ], + "resources": [ + { + "type": "monitoring_dashboard", + "name": "dashboard", + "provider": "google", + "source": "hashicorp/google", + "mode": "managed", + "version": "latest", + "description": null + } + ] +} diff --git a/resources/monitoring/dashboard/provider.tf b/resources/monitoring/dashboard/provider.tf new file mode 100644 index 0000000000..b5a84b25d1 --- /dev/null +++ b/resources/monitoring/dashboard/provider.tf @@ -0,0 +1,19 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+*/ + +provider "google" { + project = var.project_id +} diff --git a/resources/monitoring/dashboard/variables.tf b/resources/monitoring/dashboard/variables.tf new file mode 100644 index 0000000000..50e8505aee --- /dev/null +++ b/resources/monitoring/dashboard/variables.tf @@ -0,0 +1,21 @@ +variable "project_id" { + description = "Project in which the HPC deployment will be created" + type = string +} + +variable "deployment_name" { + description = "The name of the current deployment" + type = string +} + +variable "base_dashboard" { + description = "Baseline dashboard template, either custom or from ./dashboards" + type = string + default = "HPC" +} + +variable "widgets" { + description = "List of additional widgets to add to the base dashboard." + type = list(string) + default = [] +} diff --git a/resources/monitoring/dashboard/versions.tf b/resources/monitoring/dashboard/versions.tf new file mode 100644 index 0000000000..4b047db9fa --- /dev/null +++ b/resources/monitoring/dashboard/versions.tf @@ -0,0 +1,26 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+*/ + +terraform { + required_providers { + google = { + source = "hashicorp/google" + version = "~> 3.0" + } + } + + required_version = ">= 0.14.0" +} diff --git a/tools/test_examples/test_configs/dashboards.yaml b/tools/test_examples/test_configs/dashboards.yaml new file mode 100644 index 0000000000..0f45084652 --- /dev/null +++ b/tools/test_examples/test_configs/dashboards.yaml @@ -0,0 +1,58 @@ + + + +blueprint_name: hpc-cluster-small + +vars: + project_id: ## Set GCP Project ID Here ## + deployment_name: hpc-slurm-small + region: europe-west4 + zone: europe-west4-a + +resource_groups: +- group: primary + resources: + - source: resources/monitoring/dashboard + kind: terraform + id: hpc_dash + settings: + widgets: + - | + { + "text": { + "content": "Metrics from the ${deployment_name} deployment of the HPC Toolkit.", + "format": "MARKDOWN" + }, + "title": "HPC Toolkit - TEST" + } + - | + { + "text": { + "content": "Metrics from the ${deployment_name} deployment of the HPC Toolkit.", + "format": "MARKDOWN" + }, + "title": "HPC Toolkit - TEST 2" + } + - source: resources/monitoring/dashboard + kind: terraform + id: empty_dash + settings: + base_dashboard: Empty + widgets: + - | + { + "text": { + "content": "Metrics from the ${deployment_name} deployment of the HPC Toolkit.", + "format": "MARKDOWN" + }, + "title": "HPC Toolkit - TEST" + } + - | + { + "text": { + "content": "Metrics from the ${deployment_name} deployment of the HPC Toolkit.", + "format": "MARKDOWN" + }, + "title": "HPC Toolkit - TEST 2" + } + From 73e9ac9a34a9bb64aef80f98bca018aa87c31650 Mon Sep 17 00:00:00 2001 From: Alex Heye Date: Thu, 2 Dec 2021 00:47:51 +0000 Subject: [PATCH 2/5] Add license to monitoring resource --- resources/monitoring/dashboard/README.md | 14 ++++++++++++++ resources/monitoring/dashboard/main.tf | 16 ++++++++++++++++ resources/monitoring/dashboard/module.json | 2 +- resources/monitoring/dashboard/variables.tf | 16 ++++++++++++++++ 
tools/test_examples/test_configs/dashboards.yaml | 14 ++++++++++++++ 5 files changed, 61 insertions(+), 1 deletion(-) diff --git a/resources/monitoring/dashboard/README.md b/resources/monitoring/dashboard/README.md index 2b7abb1f5b..58639a37db 100644 --- a/resources/monitoring/dashboard/README.md +++ b/resources/monitoring/dashboard/README.md @@ -1,4 +1,18 @@ +Copyright 2021 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + ## Requirements | Name | Version | diff --git a/resources/monitoring/dashboard/main.tf b/resources/monitoring/dashboard/main.tf index f3a763b7b2..eb79be1ea1 100644 --- a/resources/monitoring/dashboard/main.tf +++ b/resources/monitoring/dashboard/main.tf @@ -1,3 +1,19 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + locals { dash_path = "${path.module}/dashboards/${var.base_dashboard}.json.tpl" diff --git a/resources/monitoring/dashboard/module.json b/resources/monitoring/dashboard/module.json index 165d7b2b7e..f110ee8188 100644 --- a/resources/monitoring/dashboard/module.json +++ b/resources/monitoring/dashboard/module.json @@ -1,5 +1,5 @@ { - "header": "", + "header": "Copyright 2021 Google LLC\n\nLicensed under the Apache License, Version 2.0 (the \"License\");\nyou may not use this file except in compliance with the License.\nYou may obtain a copy of the License at\n\n http://www.apache.org/licenses/LICENSE-2.0\n\nUnless required by applicable law or agreed to in writing, software\ndistributed under the License is distributed on an \"AS IS\" BASIS,\nWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nSee the License for the specific language governing permissions and\nlimitations under the License.", "footer": "", "inputs": [ { diff --git a/resources/monitoring/dashboard/variables.tf b/resources/monitoring/dashboard/variables.tf index 50e8505aee..cf984ecbe5 100644 --- a/resources/monitoring/dashboard/variables.tf +++ b/resources/monitoring/dashboard/variables.tf @@ -1,3 +1,19 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + variable "project_id" { description = "Project in which the HPC deployment will be created" type = string diff --git a/tools/test_examples/test_configs/dashboards.yaml b/tools/test_examples/test_configs/dashboards.yaml index 0f45084652..598c9b353f 100644 --- a/tools/test_examples/test_configs/dashboards.yaml +++ b/tools/test_examples/test_configs/dashboards.yaml @@ -1,3 +1,17 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + From 88e9903a866b9e2528611fdd215a404f3b135d0e Mon Sep 17 00:00:00 2001 From: Alex Heye Date: Thu, 2 Dec 2021 00:53:41 +0000 Subject: [PATCH 3/5] Add description to monitoring dashboard --- resources/monitoring/dashboard/README.md | 27 +++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/resources/monitoring/dashboard/README.md b/resources/monitoring/dashboard/README.md index 58639a37db..35a9981c9e 100644 --- a/resources/monitoring/dashboard/README.md +++ b/resources/monitoring/dashboard/README.md @@ -1,3 +1,28 @@ +## Description +Create a monitoring dashboard for the HPC cluster distribution. The resources +includes a default HPC focused dashboard with the ability to add custom widgets +as well as the option to add an empty dashboard and add widgets as needed. 
+ +## Example +``` +- source: resources/monitoring/dashboard + kind: terraform + id: hpc_dash + settings: + widgets: + - | + { + "text": { + "content": "## Header", + "format": "MARKDOWN" + }, + "title": "Custom Text Block Widget" + } +``` +This resource creates a dashboard based on the HPC dashboard (default) with an +extra text widget added as a multi-line string representing a JSON block. + +## License Copyright 2021 Google LLC @@ -48,4 +73,4 @@ No modules. ## Outputs No outputs. - \ No newline at end of file + From ae09bb4ccd891c221c57140ea0a1f7914576c1ad Mon Sep 17 00:00:00 2001 From: Alex Heye Date: Thu, 2 Dec 2021 01:04:53 +0000 Subject: [PATCH 4/5] Add monitoring dashboard to resources README --- resources/README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/resources/README.md b/resources/README.md index 56c1a4328d..9554277f17 100644 --- a/resources/README.md +++ b/resources/README.md @@ -35,6 +35,11 @@ more simple VM instances. Used when specifying a pre-existing file system to be mounted by simple_instances and slurm resources. +### Monitoring + +* [**dashboard**](monitoring/dashboard/README.md): Creates a +[monitoring dashboard](https://cloud.google.com/monitoring/dashboards) for +visually tracking an HPC Toolkit deployment. 
### Network From 21368ff2568dd325c6b0d43b5bdb0f85ffd70829 Mon Sep 17 00:00:00 2001 From: Alex Heye Date: Thu, 2 Dec 2021 01:23:13 +0000 Subject: [PATCH 5/5] Clarify descriptions and add title variable --- resources/monitoring/dashboard/README.md | 5 +++-- resources/monitoring/dashboard/dashboards/Empty.json.tpl | 4 ++-- resources/monitoring/dashboard/dashboards/HPC.json.tpl | 4 ++-- resources/monitoring/dashboard/main.tf | 1 + resources/monitoring/dashboard/module.json | 9 ++++++++- resources/monitoring/dashboard/variables.tf | 8 +++++++- tools/test_examples/test_configs/dashboards.yaml | 4 ++-- 7 files changed, 25 insertions(+), 10 deletions(-) diff --git a/resources/monitoring/dashboard/README.md b/resources/monitoring/dashboard/README.md index 35a9981c9e..343c848e0c 100644 --- a/resources/monitoring/dashboard/README.md +++ b/resources/monitoring/dashboard/README.md @@ -1,5 +1,5 @@ ## Description -Create a monitoring dashboard for the HPC cluster distribution. The resources +Creates a monitoring dashboard for the HPC cluster distribution. The resource includes a default HPC focused dashboard with the ability to add custom widgets as well as the option to add an empty dashboard and add widgets as needed. @@ -65,9 +65,10 @@ No modules. 
| Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [base\_dashboard](#input\_base\_dashboard) | Baseline dashboard template, either custom or from ./dashboards | `string` | `"HPC"` | no | +| [base\_dashboard](#input\_base\_dashboard) | Baseline dashboard template, select from HPC or Emtpy | `string` | `"HPC"` | no | | [deployment\_name](#input\_deployment\_name) | The name of the current deployment | `string` | n/a | yes | | [project\_id](#input\_project\_id) | Project in which the HPC deployment will be created | `string` | n/a | yes | +| [title](#input\_title) | Title of the created dashboard | `string` | `"HPC Toolkit Dashboard"` | no | | [widgets](#input\_widgets) | List of additional widgets to add to the base dashboard. | `list(string)` | `[]` | no | ## Outputs diff --git a/resources/monitoring/dashboard/dashboards/Empty.json.tpl b/resources/monitoring/dashboard/dashboards/Empty.json.tpl index 347cb5be02..a341fd7693 100644 --- a/resources/monitoring/dashboard/dashboards/Empty.json.tpl +++ b/resources/monitoring/dashboard/dashboards/Empty.json.tpl @@ -1,5 +1,5 @@ { - "displayName": "HPC Toolkit: ${deployment_name}", + "displayName": "${title}: ${deployment_name}", "gridLayout": { "columns": 2, "widgets": [ @@ -8,7 +8,7 @@ "content": "Metrics from the ${deployment_name} deployment of the HPC Toolkit.", "format": "MARKDOWN" }, - "title": "HPC Toolkit" + "title": "${title}" }%{ for widget in widgets ~}, ${widget} %{endfor ~} diff --git a/resources/monitoring/dashboard/dashboards/HPC.json.tpl b/resources/monitoring/dashboard/dashboards/HPC.json.tpl index 81d564790f..9a9c668f50 100644 --- a/resources/monitoring/dashboard/dashboards/HPC.json.tpl +++ b/resources/monitoring/dashboard/dashboards/HPC.json.tpl @@ -1,6 +1,6 @@ { "category": "CUSTOM", - "displayName": "HPC: ${deployment_name}", + "displayName": "${title}: ${deployment_name}", "gridLayout": { "columns": 2, "widgets": [ @@ -9,7 +9,7 @@ "content": "HPC 
metrics from the ${deployment_name} deployment of the HPC Toolkit.", "format": "MARKDOWN" }, - "title": "HPC Toolkit" + "title": "${title}" }, { "title": "VM Instance - Memory utilization", diff --git a/resources/monitoring/dashboard/main.tf b/resources/monitoring/dashboard/main.tf index eb79be1ea1..9174b4bfa2 100644 --- a/resources/monitoring/dashboard/main.tf +++ b/resources/monitoring/dashboard/main.tf @@ -23,6 +23,7 @@ resource "google_monitoring_dashboard" "dashboard" { dashboard_json = templatefile(local.dash_path, { widgets = var.widgets deployment_name = var.deployment_name + title = var.title } ) project = var.project_id diff --git a/resources/monitoring/dashboard/module.json b/resources/monitoring/dashboard/module.json index f110ee8188..d3b368d2a3 100644 --- a/resources/monitoring/dashboard/module.json +++ b/resources/monitoring/dashboard/module.json @@ -5,7 +5,7 @@ { "name": "base_dashboard", "type": "string", - "description": "Baseline dashboard template, either custom or from ./dashboards", + "description": "Baseline dashboard template, select from HPC or Empty", "default": "HPC", "required": false }, @@ -23,6 +23,13 @@ "default": null, "required": true }, + { + "name": "title", + "type": "string", + "description": "Title of the created dashboard", + "default": "HPC Toolkit Dashboard", + "required": false + }, { "name": "widgets", "type": "list(string)", diff --git a/resources/monitoring/dashboard/variables.tf b/resources/monitoring/dashboard/variables.tf index cf984ecbe5..384e55a1ba 100644 --- a/resources/monitoring/dashboard/variables.tf +++ b/resources/monitoring/dashboard/variables.tf @@ -25,11 +25,17 @@ variable "deployment_name" { } variable "base_dashboard" { - description = "Baseline dashboard template, either custom or from ./dashboards" + description = "Baseline dashboard template, select from HPC or Empty" type = string default = "HPC" } +variable "title" { + description = "Title of the created dashboard" + type = string + default = "HPC
Toolkit Dashboard" +} + variable "widgets" { description = "List of additional widgets to add to the base dashboard." type = list(string) diff --git a/tools/test_examples/test_configs/dashboards.yaml b/tools/test_examples/test_configs/dashboards.yaml index 598c9b353f..7593687cb3 100644 --- a/tools/test_examples/test_configs/dashboards.yaml +++ b/tools/test_examples/test_configs/dashboards.yaml @@ -15,11 +15,11 @@ -blueprint_name: hpc-cluster-small +blueprint_name: dashboards vars: project_id: ## Set GCP Project ID Here ## - deployment_name: hpc-slurm-small + deployment_name: dashboards-test region: europe-west4 zone: europe-west4-a