From 97ed683ea97c09f2e1ac1e99bf8c7f486d7818c9 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Fri, 17 Mar 2023 13:44:15 -0400 Subject: [PATCH 01/13] feat: Add pre-built grafana dashboard Signed-off-by: Rafael Vasquez --- config/grafana/ModelMeshMetricsDashboard.json | 2353 +++++++++++++++++ 1 file changed, 2353 insertions(+) create mode 100644 config/grafana/ModelMeshMetricsDashboard.json diff --git a/config/grafana/ModelMeshMetricsDashboard.json b/config/grafana/ModelMeshMetricsDashboard.json new file mode 100644 index 00000000..cd0c51d8 --- /dev/null +++ b/config/grafana/ModelMeshMetricsDashboard.json @@ -0,0 +1,2353 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 86400, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 6, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 3, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": 
"green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Age at Eviction" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F2CC0C", + "mode": "fixed" + } + }, + { + "id": "custom.drawStyle", + "value": "points" + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 10, + "x": 0, + "y": 1 + }, + "id": 44, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "max(modelmesh_instance_lru_age_seconds{namespace=\"$namespace\",pod=~\"$servicename-.*\"})", + "interval": "", + "legendFormat": "LRU Age", + "refId": "A" + }, + { + "exemplar": true, + "expr": "max(rate(modelmesh_age_at_eviction_milliseconds_sum{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval])/rate(modelmesh_age_at_eviction_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))/1000", + "hide": false, + "interval": "", + "legendFormat": "Age at Eviction", + "refId": "B" + } + ], + "title": "Global LRU age", + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 16, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 3, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + 
"color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "kbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 10, + "x": 10, + "y": 1 + }, + "id": 16, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "sum(modelmesh_instance_used_bytes{namespace=\"$namespace\",pod=~\"$servicename-.*\"}/1024)", + "interval": "", + "legendFormat": "Usage", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(modelmesh_instance_capacity_bytes{namespace=\"$namespace\",pod=~\"$servicename-.*\"}/1024)", + "hide": false, + "interval": "", + "legendFormat": "Total Cache Capacity", + "refId": "B" + } + ], + "title": "Cluster capacity and utilization", + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 5, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 20, + "y": 1 + }, + "id": 73, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "exemplar": true, + 
"expr": "count(container_memory_usage_bytes{namespace=\"$namespace\",pod=~\"$servicename-.*\",container=\"$mm_container\"})", + "interval": "", + "legendFormat": "Count", + "refId": "A" + } + ], + "title": "Number of Pods", + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 16, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 3, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "locale" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "models with load failure" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 45, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "max(modelmesh_models_managed_total{namespace=\"$namespace\",pod=~\"$servicename-.*\"})", + "interval": "", + "legendFormat": "managed models", + "refId": "A" + }, + { + "exemplar": true, + "expr": "max(modelmesh_models_loaded_total{namespace=\"$namespace\",pod=~\"$servicename-.*\"})", + "hide": false, + "interval": "", + "legendFormat": "loaded models", + "refId": "B" + }, + { + 
"exemplar": true, + "expr": "max(modelmesh_models_with_failure_total{namespace=\"$namespace\",pod=~\"$servicename-.*\"})", + "hide": false, + "interval": "", + "legendFormat": "models with load failure", + "refId": "C" + } + ], + "title": "Model Counts", + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 71, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "label_replace(modelmesh_instance_models_total{namespace=\"$namespace\",pod=~\"$servicename-.*\"}, \"short_podname\", \"$1\", \"pod\", \"$servicename-(.*)\")", + "interval": "", + "legendFormat": "{{short_podname}}", + "refId": "A" + } + ], + "title": "Model Counts per Pod", + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "Number of processed request per second", + "axisPlacement": "left", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 21, + "gradientMode": "none", + 
"hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 15 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.1.5", + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(modelmesh_api_request_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))", + "hide": false, + "interval": "", + "legendFormat": "External API", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(rate(modelmesh_invoke_model_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))", + "hide": false, + "interval": "", + "legendFormat": "Internal API", + "refId": "B" + } + ], + "title": "Inference API Request Rate", + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + 
"mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 65, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "avg(rate(modelmesh_request_size_bytes_sum{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))/avg(rate(modelmesh_request_size_bytes_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))", + "interval": "", + "legendFormat": "Request Size", + "refId": "A" + }, + { + "exemplar": true, + "expr": "avg(rate(modelmesh_response_size_bytes_sum{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))/avg(rate(modelmesh_response_size_bytes_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))", + "hide": false, + "interval": "", + "legendFormat": "Response Size", + "refId": "B" + } + ], + "title": "Average Inference Request and Response Sizes", + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "Number of processed request per second", + "axisPlacement": "left", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 21, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + 
"mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 22 + }, + "id": 62, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.1.5", + "targets": [ + { + "exemplar": true, + "expr": "label_replace(sum by (pod)(rate(modelmesh_api_request_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval])), \"short_podname\", \"$1\", \"pod\", \"$servicename-(.*)\")", + "hide": false, + "interval": "", + "legendFormat": "{{short_podname}}", + "refId": "A" + } + ], + "title": "External Inference API Request Rate Per Pod", + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 8, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 23 + }, + "id": 57, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + 
"pluginVersion": "8.1.5", + "targets": [ + { + "exemplar": true, + "expr": "label_replace(rate(modelmesh_api_request_milliseconds_sum{namespace=\"$namespace\",pod=~\"$servicename-.*\",code=\"OK\"}[$__rate_interval])/rate(modelmesh_api_request_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\",code=\"OK\"}[$__rate_interval]), \"short_podname\", \"$1\", \"pod\", \"$servicename-(.*)\")", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{short_podname}}", + "refId": "A" + } + ], + "title": "External Inference API Response Times by Pod (excluding errors)", + "transformations": [], + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "Number of processed request per second", + "axisPlacement": "left", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 21, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 0, + "y": 31 + }, + "id": 63, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.1.5", + "targets": [ + { + "exemplar": true, + "expr": "label_replace(rate(modelmesh_invoke_model_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]), 
\"short_podname\", \"$1\", \"pod\", \"$servicename-(.*)\")", + "hide": false, + "interval": "", + "legendFormat": "{{short_podname}}", + "refId": "A" + } + ], + "title": "Internal Inference API Requests Rate Per Pod", + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 8, + "y": 31 + }, + "id": 58, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.4.7", + "targets": [ + { + "exemplar": true, + "expr": "label_replace(\n rate(modelmesh_invoke_model_milliseconds_sum{namespace=\"$namespace\",pod=~\"$servicename-.*\",code=\"OK\"}[$__rate_interval]) /\n rate(modelmesh_invoke_model_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\",code=\"OK\"}[$__rate_interval]),\n \"short_podname\", \"$1\", \"pod\", \"$servicename-(.*)\")", + "interval": "", + "legendFormat": "{{short_podname}}", + "refId": "A" + } + ], + "title": "Internal Inference API Response Time By Pod (excluding errors)", + "transformations": [], + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": 
{ + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 8, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 16, + "y": 31 + }, + "id": 59, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.1.5", + "targets": [ + { + "exemplar": true, + "expr": "label_replace(rate(modelmesh_req_queue_delay_milliseconds_sum{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[5m])/rate(modelmesh_req_queue_delay_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[5m]), \"short_podname\", \"$1\", \"pod\", \"$servicename-(.*)\")\n", + "interval": "", + "legendFormat": "{{short_podname}}", + "refId": "A" + } + ], + "title": "Inference API Queue Delay Time By Pod", + "transformations": [], + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "Number of missed cache models", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, 
+ "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Cache Misses" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": ["Cache Misses"], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 40 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "sum(increase(modelmesh_cache_miss_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))", + "interval": "10m", + "legendFormat": "Cache Misses", + "refId": "A" + } + ], + "title": "Cache Misses (per 10min)", + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "Cache misses as percentage of requests", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + 
"lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Cache Miss Rate" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 9, + "x": 8, + "y": 40 + }, + "id": 74, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "sum(increase(modelmesh_cache_miss_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))/sum(increase(modelmesh_api_request_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))", + "hide": false, + "interval": "20m", + "legendFormat": "Cache Miss Rate", + "refId": "A" + } + ], + "title": "Cache Miss Rate", + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "Average request delay due to cache miss", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": 
"off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 7, + "x": 17, + "y": 40 + }, + "id": 46, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "avg(rate(modelmesh_cache_miss_milliseconds_sum{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[5m]))/avg(rate(modelmesh_cache_miss_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[5m]))", + "interval": "", + "legendFormat": "Cache miss delay", + "refId": "A" + } + ], + "title": "Cache Miss Delay (average)", + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "Number of model loading per second", + "axisPlacement": "auto", + "axisSoftMax": 5, + "axisSoftMin": -5, + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 29, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Model Unloads" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "semi-dark-yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Model 
Evictions" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "light-orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Load Failures" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "light-red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 9, + "x": 0, + "y": 48 + }, + "id": 22, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "sum(increase(modelmesh_loadmodel_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))", + "format": "time_series", + "interval": "5m", + "intervalFactor": 1, + "legendFormat": "Model Loads", + "refId": "A" + }, + { + "exemplar": true, + "expr": "-sum(increase(modelmesh_unloadmodel_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))", + "hide": false, + "interval": "5m", + "intervalFactor": 1, + "legendFormat": "Model Unloads", + "refId": "B" + }, + { + "exemplar": true, + "expr": "-sum(increase(modelmesh_age_at_eviction_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))", + "hide": false, + "interval": "5m", + "intervalFactor": 1, + "legendFormat": "Model Evictions", + "refId": "C" + }, + { + "exemplar": true, + "expr": "sum(increase(modelmesh_loadmodel_failure{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))", + "hide": false, + "interval": "5m", + "intervalFactor": 1, + "legendFormat": "Load Failures", + "refId": "D" + } + ], + "title": "Model Loads/Unloads (per 5min)", + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + 
"fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 7, + "x": 9, + "y": 48 + }, + "id": 61, + "options": { + "graph": {}, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "7.5.11", + "targets": [ + { + "exemplar": true, + "expr": "avg(rate(modelmesh_loaded_model_size_bytes_sum{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))/avg(rate(modelmesh_loaded_model_size_bytes_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))", + "interval": "", + "legendFormat": "Loaded Model Size", + "refId": "A" + } + ], + "title": "Loaded Model Sizes", + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 8, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + 
"decimals": 0, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 8, + "x": 16, + "y": 48 + }, + "id": 47, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.1.5", + "targets": [ + { + "exemplar": true, + "expr": "avg(rate(modelmesh_loadmodel_milliseconds_sum{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))/avg(rate(modelmesh_loadmodel_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))", + "interval": "", + "legendFormat": "Loading Time", + "refId": "A" + }, + { + "exemplar": true, + "expr": "avg(rate(modelmesh_model_sizing_milliseconds_sum{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))/avg(rate(modelmesh_model_sizing_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))", + "hide": false, + "interval": "", + "legendFormat": "Sizing Time", + "refId": "B" + } + ], + "title": "Model Loading Times", + "transformations": [], + "type": "timeseries" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 58 + }, + "id": 69, + "panels": [], + "title": "Resource Utilization", + "type": "row" + }, + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 5, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 2, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + 
"spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 3, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "core" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "CPU Requests" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "super-light-blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Allocation" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "super-light-blue", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 59 + }, + "id": 48, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\",pod=~\"$servicename-.*\", container=\"$mm_container\"}[$__rate_interval]), \"short_podname\", \"$1\", \"pod\",\"$servicename-(.*)\")", + "interval": "", + "legendFormat": "{{short_podname}}", + "refId": "A" + }, + { + "exemplar": true, + "expr": "avg(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{namespace=\"$namespace\",pod=~\"$servicename-.*\",container=\"$mm_container\"})", + "hide": false, + "interval": "", + "legendFormat": "Allocation", + "refId": "B" + } + ], + "title": "ModelMesh Container CPU Usage", + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, 
+ "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decmbytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Memory Requests" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "super-light-blue", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 59 + }, + "id": 50, + "options": { + "graph": {}, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "7.5.11", + "targets": [ + { + "exemplar": true, + "expr": "label_replace(container_memory_usage_bytes{namespace=\"$namespace\",pod=~\"$servicename-.*\", container=\"$mm_container\"}/1024/1024,\"short_podname\", \"$1\", \"pod\",\"$servicename-(.*)\")", + "interval": "", + "legendFormat": "{{short_podname}}", + "refId": "A" + }, + { + "exemplar": true, + "expr": "avg(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{namespace=\"$namespace\",container=\"$mm_container\",pod=~\"$servicename-.*\"}/1024/1024)", + "hide": false, + "interval": "", + "legendFormat": "Allocation", + "refId": "B" + } + ], + "title": "Model Mesh Container Memory Usage", + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 2, + "gradientMode": "none", + "hideFrom": { + "graph": 
false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 2, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "core" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 70 + }, + "id": 33, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "exemplar": false, + "expr": "rate(container_cpu_usage_seconds_total{namespace=\"$namespace\",pod=~\"$servicename-.*\",container!=\"$mm_container\",container!=\"\"}[$__rate_interval])", + "hide": false, + "interval": "", + "legendFormat": "{{container}}", + "refId": "A" + }, + { + "exemplar": true, + "expr": "cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{namespace=\"$namespace\",pod=~\"$servicename-.*\",container!=\"$mm_container\",container!=\"\"}", + "hide": false, + "interval": "", + "legendFormat": "{{container}}-alloc", + "refId": "B" + } + ], + "title": "Serving Runtime Container CPU Usage", + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + 
"showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decmbytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Memory Requests" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "super-light-blue", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 70 + }, + "id": 77, + "options": { + "graph": {}, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "7.5.11", + "targets": [ + { + "exemplar": true, + "expr": "label_replace(container_memory_usage_bytes{namespace=\"$namespace\",pod=~\"$servicename-.*\",container!=\"$mm_container\",container!=\"\"}/1024/1024,\"short_podname\", \"$1\", \"pod\",\"$servicename-(.*)\")", + "interval": "", + "legendFormat": "{{short_podname}}", + "refId": "A" + }, + { + "exemplar": true, + "expr": "avg(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{namespace=\"$namespace\",container!=\"$mm_container\",container!=\"\",pod=~\"$servicename-.*\"}/1024/1024)", + "hide": false, + "interval": "", + "legendFormat": "Allocation", + "refId": "B" + } + ], + "title": "Serving Runtime Container Memory Usage", + "type": "timeseries" + } + ], + "refresh": false, + "schemaVersion": 35, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "my-namespace", + "value": "my-namespace" + }, + "hide": 0, + "name": "namespace", + "options": [ + { + "selected": true, + "text": "my-namespace", + "value": "my-namespace" + } + ], + "query": "my-namespace", + "skipUrlSync": false, + "type": "textbox" + }, + 
{ + "current": { + "selected": false, + "text": "my-service-name", + "value": "my-service-name" + }, + "hide": 0, + "name": "servicename", + "options": [ + { + "selected": true, + "text": "my-service-name", + "value": "my-service-name" + } + ], + "query": "my-service-name", + "skipUrlSync": false, + "type": "textbox" + }, + { + "current": { + "selected": true, + "text": "mm-runtime", + "value": "mm-runtime" + }, + "hide": 0, + "includeAll": false, + "label": "MM Container Name", + "multi": false, + "name": "mm_container", + "options": [ + { + "selected": false, + "text": "mm", + "value": "mm" + }, + { + "selected": true, + "text": "mm-runtime", + "value": "mm-runtime" + }, + { + "selected": false, + "text": "modelmesh-runtime", + "value": "modelmesh-runtime" + } + ], + "query": "mm,mm-runtime,modelmesh-runtime", + "queryValue": "", + "skipUrlSync": false, + "type": "custom" + } + ] + }, + "time": { + "from": "now-2d", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "ModelMesh Metrics", + "uid": "vMm_rt-7z", + "version": 38, + "weekStart": "" +} \ No newline at end of file From 92f766307acf9bbb54171acc5f493f7b9e091037 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Fri, 17 Mar 2023 14:01:10 -0400 Subject: [PATCH 02/13] Updates doc link to dashboard json Signed-off-by: Rafael Vasquez --- docs/monitoring.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/monitoring.md b/docs/monitoring.md index c8c1518a..4936a858 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -28,7 +28,7 @@ If you have an alternative solution to collect the metrics, you can disable the We suggest using Grafana to visualize the Prometheus monitoring data. You can learn more about deploying/configuring both Prometheus and Grafana by checking out [this repo](https://github.com/prometheus-operator/kube-prometheus#quickstart). 
Also, check out [this page](https://github.com/kserve/modelmesh-performance/blob/main/docs/monitoring/README.md##Setup-Prometheus-Operator) for some tips on how to set it up. -When a Grafana instance is installed and running in the cluster, [this JSON file](https://github.com/kserve/modelmesh-performance/blob/main/docs/monitoring/modelmesh_grafana_dashboard_1634165844916.json) containing our Grafana Dashboard with ModelMesh metrics is suggested to view the metrics below. +When a Grafana instance is installed and running in the cluster, [this JSON file](/config/grafana/ModelMeshMetricsDashboard.json) containing our Grafana Dashboard with ModelMesh metrics is suggested to view the metrics below. ## Metrics From fae723a0e4ac130131efa1783e3483cd72a77b8a Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Fri, 17 Mar 2023 14:28:38 -0400 Subject: [PATCH 03/13] Lints Signed-off-by: Rafael Vasquez --- config/grafana/ModelMeshMetricsDashboard.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/grafana/ModelMeshMetricsDashboard.json b/config/grafana/ModelMeshMetricsDashboard.json index cd0c51d8..7c9f9525 100644 --- a/config/grafana/ModelMeshMetricsDashboard.json +++ b/config/grafana/ModelMeshMetricsDashboard.json @@ -2350,4 +2350,4 @@ "uid": "vMm_rt-7z", "version": 38, "weekStart": "" -} \ No newline at end of file +} From 3265601da6b8f5b3f2035a4049da2b5e713b9768 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Tue, 28 Mar 2023 15:06:06 -0400 Subject: [PATCH 04/13] Adds CPU usage per deployment Signed-off-by: Rafael Vasquez --- config/grafana/ModelMeshMetricsDashboard.json | 140 ++++++++++++++++++ 1 file changed, 140 insertions(+) diff --git a/config/grafana/ModelMeshMetricsDashboard.json b/config/grafana/ModelMeshMetricsDashboard.json index 7c9f9525..6cede1c4 100644 --- a/config/grafana/ModelMeshMetricsDashboard.json +++ b/config/grafana/ModelMeshMetricsDashboard.json @@ -2259,6 +2259,146 @@ ], "title": "Serving Runtime Container Memory Usage", 
"type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 5, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 2, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 3, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "core" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "CPU Requests" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "super-light-blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Allocation" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "super-light-blue", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 80 + }, + "id": 78, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum by (deployment) (label_replace(sum by (pod) (rate(container_cpu_usage_seconds_total{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[5m])), \"deployment\", \"$2\", \"pod\", \"(^modelmesh-serving)-(.*)-(.*)-(.*)-(.*)\"))", + "interval": "", + 
"legendFormat": "{{short_podname}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", + "exemplar": true, + "expr": "avg(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{namespace=\"$namespace\",pod=~\"$servicename-.*\",container=\"$mm_container\"})", + "hide": true, + "interval": "", + "legendFormat": "Allocation", + "range": true, + "refId": "B" + } + ], + "title": "ModelMesh Deployment CPU Usage", + "type": "timeseries" } ], "refresh": false, From 2e5342afff8bceea7a18bf1a7a0ed209a5c62452 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Tue, 28 Mar 2023 15:22:41 -0400 Subject: [PATCH 05/13] Update regex for deployment cpu usage Signed-off-by: Rafael Vasquez --- config/grafana/ModelMeshMetricsDashboard.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/grafana/ModelMeshMetricsDashboard.json b/config/grafana/ModelMeshMetricsDashboard.json index 6cede1c4..9ea674a1 100644 --- a/config/grafana/ModelMeshMetricsDashboard.json +++ b/config/grafana/ModelMeshMetricsDashboard.json @@ -2376,7 +2376,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "sum by (deployment) (label_replace(sum by (pod) (rate(container_cpu_usage_seconds_total{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[5m])), \"deployment\", \"$2\", \"pod\", \"(^modelmesh-serving)-(.*)-(.*)-(.*)-(.*)\"))", + "expr": "sum by (deployment) (label_replace(sum by (pod) (rate(container_cpu_usage_seconds_total{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[5m])), \"deployment\", \"$2\", \"pod\", \"(modelmesh-serving)-(.*)-(.*-.*-.*)", "interval": "", "legendFormat": "{{short_podname}}", "range": true, From e73b43c2361e680681d10034b702bf8d3f998d6f Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Fri, 31 Mar 2023 17:26:04 -0400 Subject: [PATCH 06/13] Adds deployment-view metrics Signed-off-by: Rafael Vasquez --- config/grafana/ModelMeshMetricsDashboard.json | 
1959 ++++++++++++++--- 1 file changed, 1660 insertions(+), 299 deletions(-) diff --git a/config/grafana/ModelMeshMetricsDashboard.json b/config/grafana/ModelMeshMetricsDashboard.json index 9ea674a1..8ff798ac 100644 --- a/config/grafana/ModelMeshMetricsDashboard.json +++ b/config/grafana/ModelMeshMetricsDashboard.json @@ -1,19 +1,12 @@ { - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], "annotations": { "list": [ { "builtIn": 1, - "datasource": "-- Grafana --", + "datasource": { + "type": "datasource", + "uid": "grafana" + }, "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", @@ -29,19 +22,38 @@ ] }, "editable": true, + "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": null, + "id": 43, "links": [], "liveNow": false, "panels": [ { - "datasource": "${DS_PROMETHEUS}", + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 81, + "panels": [], + "title": "Global Metrics", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "axisSoftMin": 86400, @@ -107,6 +119,28 @@ "value": "points" } ] + }, + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": ["Age at Eviction"], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] } ] }, @@ -116,12 +150,13 @@ "x": 0, "y": 1 }, - "id": 44, + "id": 82, "options": { "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", @@ -130,13 +165,23 @@ 
}, "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", "exemplar": true, "expr": "max(modelmesh_instance_lru_age_seconds{namespace=\"$namespace\",pod=~\"$servicename-.*\"})", "interval": "", "legendFormat": "LRU Age", + "range": true, "refId": "A" }, { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, "exemplar": true, "expr": "max(rate(modelmesh_age_at_eviction_milliseconds_sum{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval])/rate(modelmesh_age_at_eviction_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))/1000", "hide": false, @@ -149,7 +194,10 @@ "type": "timeseries" }, { - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, "description": "", "fieldConfig": { "defaults": { @@ -157,12 +205,14 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "axisSoftMin": 0, "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 16, + "fillOpacity": 21, "gradientMode": "none", "hideFrom": { "graph": false, @@ -202,7 +252,98 @@ }, "unit": "deckbytes" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "triton capacity" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "mlserver capacity" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "triton usage" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "super-light-red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "mlserver usage" + }, + "properties": [ + { + "id": "color", + "value": { + 
"fixedColor": "super-light-blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "total usage" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "super-light-green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "total capacity" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + } + ] + } + ] }, "gridPos": { "h": 7, @@ -210,12 +351,13 @@ "x": 10, "y": 1 }, - "id": 16, + "id": 83, "options": { "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", @@ -224,18 +366,30 @@ }, "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", "exemplar": true, "expr": "sum(modelmesh_instance_used_bytes{namespace=\"$namespace\",pod=~\"$servicename-.*\"}/1024)", "interval": "", - "legendFormat": "Usage", + "legendFormat": "total usage", + "range": true, "refId": "A" }, { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", "exemplar": true, "expr": "sum(modelmesh_instance_capacity_bytes{namespace=\"$namespace\",pod=~\"$servicename-.*\"}/1024)", "hide": false, "interval": "", - "legendFormat": "Total Cache Capacity", + "legendFormat": "total capacity", + "range": true, "refId": "B" } ], @@ -243,7 +397,7 @@ "type": "timeseries" }, { - "datasource": "${DS_PROMETHEUS}", + "datasource": "P1809F7CD0C75ACF3", "description": "", "fieldConfig": { "defaults": { @@ -251,6 +405,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -307,7 +463,8 @@ "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", @@ -316,6 +473,25 @@ }, 
"targets": [ { + "datasource": { + "0": "P", + "1": "1", + "2": "8", + "3": "0", + "4": "9", + "5": "F", + "6": "7", + "7": "C", + "8": "D", + "9": "0", + "10": "C", + "11": "7", + "12": "5", + "13": "A", + "14": "C", + "15": "F", + "16": "3" + }, "exemplar": true, "expr": "count(container_memory_usage_bytes{namespace=\"$namespace\",pod=~\"$servicename-.*\",container=\"$mm_container\"})", "interval": "", @@ -327,7 +503,10 @@ "type": "timeseries" }, { - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, "description": "", "fieldConfig": { "defaults": { @@ -335,12 +514,14 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "axisSoftMin": 0, "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 16, + "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "graph": false, @@ -371,10 +552,6 @@ { "color": "green", "value": null - }, - { - "color": "red", - "value": 80 } ] }, @@ -391,7 +568,7 @@ "id": "color", "value": { "fixedColor": "red", - "mode": "fixed" + "mode": "palette-classic" } } ] @@ -404,12 +581,13 @@ "x": 0, "y": 8 }, - "id": 45, + "id": 84, "options": { "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", @@ -418,26 +596,44 @@ }, "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", "exemplar": true, "expr": "max(modelmesh_models_managed_total{namespace=\"$namespace\",pod=~\"$servicename-.*\"})", "interval": "", - "legendFormat": "managed models", + "legendFormat": "Managed", + "range": true, "refId": "A" }, { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", "exemplar": true, "expr": "max(modelmesh_models_loaded_total{namespace=\"$namespace\",pod=~\"$servicename-.*\"})", "hide": false, "interval": "", 
- "legendFormat": "loaded models", + "legendFormat": "Loaded", + "range": true, "refId": "B" }, { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", "exemplar": true, "expr": "max(modelmesh_models_with_failure_total{namespace=\"$namespace\",pod=~\"$servicename-.*\"})", "hide": false, "interval": "", - "legendFormat": "models with load failure", + "legendFormat": "Failed", + "range": true, "refId": "C" } ], @@ -445,7 +641,7 @@ "type": "timeseries" }, { - "datasource": "${DS_PROMETHEUS}", + "datasource": "P1809F7CD0C75ACF3", "description": "", "fieldConfig": { "defaults": { @@ -453,6 +649,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -495,7 +693,7 @@ "overrides": [] }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, "y": 8 @@ -505,7 +703,8 @@ "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", @@ -514,6 +713,25 @@ }, "targets": [ { + "datasource": { + "0": "P", + "1": "1", + "2": "8", + "3": "0", + "4": "9", + "5": "F", + "6": "7", + "7": "C", + "8": "D", + "9": "0", + "10": "C", + "11": "7", + "12": "5", + "13": "A", + "14": "C", + "15": "F", + "16": "3" + }, "exemplar": true, "expr": "label_replace(modelmesh_instance_models_total{namespace=\"$namespace\",pod=~\"$servicename-.*\"}, \"short_podname\", \"$1\", \"pod\", \"$servicename-(.*)\")", "interval": "", @@ -525,13 +743,18 @@ "type": "timeseries" }, { - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "Number of processed request per second", "axisPlacement": "left", "axisSoftMin": 0, @@ -585,12 +808,13 @@ "x": 0, "y": 15 }, - "id": 10, 
+ "id": 86, "options": { "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", @@ -600,19 +824,31 @@ "pluginVersion": "8.1.5", "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", "exemplar": true, "expr": "sum(rate(modelmesh_api_request_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))", "hide": false, "interval": "", "legendFormat": "External API", + "range": true, "refId": "A" }, { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", "exemplar": true, "expr": "sum(rate(modelmesh_invoke_model_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))", "hide": false, "interval": "", "legendFormat": "Internal API", + "range": true, "refId": "B" } ], @@ -620,13 +856,18 @@ "type": "timeseries" }, { - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -676,14 +917,15 @@ "h": 7, "w": 12, "x": 12, - "y": 16 + "y": 15 }, - "id": 65, + "id": 87, "options": { "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", @@ -692,13 +934,23 @@ }, "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", "exemplar": true, "expr": "avg(rate(modelmesh_request_size_bytes_sum{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))/avg(rate(modelmesh_request_size_bytes_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))", "interval": "", "legendFormat": "Request 
Size", + "range": true, "refId": "A" }, { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, "exemplar": true, "expr": "avg(rate(modelmesh_response_size_bytes_sum{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))/avg(rate(modelmesh_response_size_bytes_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))", "hide": false, @@ -711,13 +963,15 @@ "type": "timeseries" }, { - "datasource": "${DS_PROMETHEUS}", + "datasource": "P1809F7CD0C75ACF3", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "Number of processed request per second", "axisPlacement": "left", "axisSoftMin": 0, @@ -766,7 +1020,7 @@ "overrides": [] }, "gridPos": { - "h": 9, + "h": 8, "w": 12, "x": 0, "y": 22 @@ -776,7 +1030,8 @@ "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", @@ -786,6 +1041,25 @@ "pluginVersion": "8.1.5", "targets": [ { + "datasource": { + "0": "P", + "1": "1", + "2": "8", + "3": "0", + "4": "9", + "5": "F", + "6": "7", + "7": "C", + "8": "D", + "9": "0", + "10": "C", + "11": "7", + "12": "5", + "13": "A", + "14": "C", + "15": "F", + "16": "3" + }, "exemplar": true, "expr": "label_replace(sum by (pod)(rate(modelmesh_api_request_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[1m])), \"short_podname\", \"$1\", \"pod\", \"$servicename-(.*)\")", "hide": false, @@ -798,13 +1072,15 @@ "type": "timeseries" }, { - "datasource": "${DS_PROMETHEUS}", + "datasource": "P1809F7CD0C75ACF3", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -859,14 +1135,15 @@ "h": 8, "w": 12, "x": 12, - "y": 23 + "y": 22 }, "id": 57, "options": { "legend": { "calcs": [], 
"displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", @@ -876,6 +1153,25 @@ "pluginVersion": "8.1.5", "targets": [ { + "datasource": { + "0": "P", + "1": "1", + "2": "8", + "3": "0", + "4": "9", + "5": "F", + "6": "7", + "7": "C", + "8": "D", + "9": "0", + "10": "C", + "11": "7", + "12": "5", + "13": "A", + "14": "C", + "15": "F", + "16": "3" + }, "exemplar": true, "expr": "label_replace(rate(modelmesh_api_request_milliseconds_sum{namespace=\"$namespace\",pod=~\"$servicename-.*\",code=\"OK\"}[$__rate_interval])/rate(modelmesh_api_request_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\",code=\"OK\"}[$__rate_interval]), \"short_podname\", \"$1\", \"pod\", \"$servicename-(.*)\")", "hide": false, @@ -890,13 +1186,15 @@ "type": "timeseries" }, { - "datasource": "${DS_PROMETHEUS}", + "datasource": "P1809F7CD0C75ACF3", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "Number of processed request per second", "axisPlacement": "left", "axisSoftMin": 0, @@ -948,14 +1246,15 @@ "h": 9, "w": 8, "x": 0, - "y": 31 + "y": 30 }, "id": 63, "options": { "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", @@ -965,6 +1264,25 @@ "pluginVersion": "8.1.5", "targets": [ { + "datasource": { + "0": "P", + "1": "1", + "2": "8", + "3": "0", + "4": "9", + "5": "F", + "6": "7", + "7": "C", + "8": "D", + "9": "0", + "10": "C", + "11": "7", + "12": "5", + "13": "A", + "14": "C", + "15": "F", + "16": "3" + }, "exemplar": true, "expr": "label_replace(rate(modelmesh_invoke_model_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]), \"short_podname\", \"$1\", \"pod\", \"$servicename-(.*)\")", "hide": false, @@ -977,13 +1295,15 @@ "type": "timeseries" }, { - "datasource": 
"${DS_PROMETHEUS}", + "datasource": "P1809F7CD0C75ACF3", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -1033,14 +1353,15 @@ "h": 9, "w": 8, "x": 8, - "y": 31 + "y": 30 }, "id": 58, "options": { "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "multi", @@ -1050,7 +1371,26 @@ "pluginVersion": "8.4.7", "targets": [ { - "exemplar": true, + "datasource": { + "0": "P", + "1": "1", + "2": "8", + "3": "0", + "4": "9", + "5": "F", + "6": "7", + "7": "C", + "8": "D", + "9": "0", + "10": "C", + "11": "7", + "12": "5", + "13": "A", + "14": "C", + "15": "F", + "16": "3" + }, + "exemplar": true, "expr": "label_replace(\n rate(modelmesh_invoke_model_milliseconds_sum{namespace=\"$namespace\",pod=~\"$servicename-.*\",code=\"OK\"}[$__rate_interval]) /\n rate(modelmesh_invoke_model_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\",code=\"OK\"}[$__rate_interval]),\n \"short_podname\", \"$1\", \"pod\", \"$servicename-(.*)\")", "interval": "", "legendFormat": "{{short_podname}}", @@ -1062,13 +1402,15 @@ "type": "timeseries" }, { - "datasource": "${DS_PROMETHEUS}", + "datasource": "P1809F7CD0C75ACF3", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -1123,14 +1465,15 @@ "h": 9, "w": 8, "x": 16, - "y": 31 + "y": 30 }, "id": 59, "options": { "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", @@ -1140,6 +1483,25 @@ "pluginVersion": "8.1.5", "targets": [ { + "datasource": { + "0": "P", + "1": "1", + "2": "8", + "3": "0", + "4": "9", + "5": "F", + "6": "7", + "7": "C", + "8": 
"D", + "9": "0", + "10": "C", + "11": "7", + "12": "5", + "13": "A", + "14": "C", + "15": "F", + "16": "3" + }, "exemplar": true, "expr": "label_replace(rate(modelmesh_req_queue_delay_milliseconds_sum{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[5m])/rate(modelmesh_req_queue_delay_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[5m]), \"short_podname\", \"$1\", \"pod\", \"$servicename-(.*)\")\n", "interval": "", @@ -1152,13 +1514,18 @@ "type": "timeseries" }, { - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "Number of missed cache models", "axisPlacement": "auto", "axisSoftMin": 0, @@ -1247,14 +1614,15 @@ "h": 8, "w": 8, "x": 0, - "y": 40 + "y": 39 }, "id": 8, "options": { "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", @@ -1263,10 +1631,16 @@ }, "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", "exemplar": true, "expr": "sum(increase(modelmesh_cache_miss_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))", "interval": "10m", "legendFormat": "Cache Misses", + "range": true, "refId": "A" } ], @@ -1274,13 +1648,18 @@ "type": "timeseries" }, { - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "Cache misses as percentage of requests", "axisPlacement": "auto", "axisSoftMin": 0, @@ -1348,14 +1727,15 @@ "h": 8, "w": 9, "x": 8, - "y": 40 + "y": 39 }, "id": 74, "options": { "legend": { "calcs": [], "displayMode": "list", 
- "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", @@ -1364,11 +1744,17 @@ }, "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", "exemplar": true, "expr": "sum(increase(modelmesh_cache_miss_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))/sum(increase(modelmesh_api_request_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))", "hide": false, "interval": "20m", "legendFormat": "Cache Miss Rate", + "range": true, "refId": "A" } ], @@ -1376,13 +1762,18 @@ "type": "timeseries" }, { - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "Average request delay due to cache miss", "axisPlacement": "auto", "axisSoftMin": 0, @@ -1434,14 +1825,15 @@ "h": 8, "w": 7, "x": 17, - "y": 40 + "y": 39 }, "id": 46, "options": { "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", @@ -1450,6 +1842,10 @@ }, "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, "exemplar": true, "expr": "avg(rate(modelmesh_cache_miss_milliseconds_sum{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[5m]))/avg(rate(modelmesh_cache_miss_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[5m]))", "interval": "", @@ -1461,13 +1857,18 @@ "type": "timeseries" }, { - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "Number of model loading per second", 
"axisPlacement": "auto", "axisSoftMax": 5, @@ -1564,14 +1965,15 @@ "h": 10, "w": 9, "x": 0, - "y": 48 + "y": 47 }, - "id": 22, + "id": 88, "options": { "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", @@ -1580,39 +1982,63 @@ }, "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", "exemplar": true, "expr": "sum(increase(modelmesh_loadmodel_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))", "format": "time_series", "interval": "5m", "intervalFactor": 1, "legendFormat": "Model Loads", + "range": true, "refId": "A" }, { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", "exemplar": true, "expr": "-sum(increase(modelmesh_unloadmodel_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))", "hide": false, "interval": "5m", "intervalFactor": 1, "legendFormat": "Model Unloads", + "range": true, "refId": "B" }, { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", "exemplar": true, "expr": "-sum(increase(modelmesh_age_at_eviction_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))", "hide": false, "interval": "5m", "intervalFactor": 1, "legendFormat": "Model Evictions", + "range": true, "refId": "C" }, { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", "exemplar": true, "expr": "sum(increase(modelmesh_loadmodel_failure{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))", "hide": false, "interval": "5m", "intervalFactor": 1, "legendFormat": "Load Failures", + "range": true, "refId": "D" } ], @@ -1620,13 +2046,18 @@ "type": "timeseries" }, { - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": 
"P1809F7CD0C75ACF3" + }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -1677,15 +2108,16 @@ "h": 10, "w": 7, "x": 9, - "y": 48 + "y": 47 }, - "id": 61, + "id": 89, "options": { "graph": {}, "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", @@ -1695,10 +2127,17 @@ "pluginVersion": "7.5.11", "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", "exemplar": true, "expr": "avg(rate(modelmesh_loaded_model_size_bytes_sum{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))/avg(rate(modelmesh_loaded_model_size_bytes_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))", + "hide": false, "interval": "", "legendFormat": "Loaded Model Size", + "range": true, "refId": "A" } ], @@ -1706,13 +2145,18 @@ "type": "timeseries" }, { - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -1768,14 +2212,15 @@ "h": 10, "w": 8, "x": 16, - "y": 48 + "y": 47 }, - "id": 47, + "id": 90, "options": { "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", @@ -1785,18 +2230,30 @@ "pluginVersion": "8.1.5", "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", "exemplar": true, "expr": 
"avg(rate(modelmesh_loadmodel_milliseconds_sum{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))/avg(rate(modelmesh_loadmodel_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))", "interval": "", "legendFormat": "Loading Time", + "range": true, "refId": "A" }, { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", "exemplar": true, "expr": "avg(rate(modelmesh_model_sizing_milliseconds_sum{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))/avg(rate(modelmesh_model_sizing_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))", "hide": false, "interval": "", "legendFormat": "Sizing Time", + "range": true, "refId": "B" } ], @@ -1805,31 +2262,38 @@ "type": "timeseries" }, { - "collapsed": true, + "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 58 + "y": 57 }, - "id": 69, + "id": 79, "panels": [], - "title": "Resource Utilization", + "title": "Deployment Metrics", "type": "row" }, { - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "description": "", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "axisSoftMin": 0, "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 5, + "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "graph": false, @@ -1837,9 +2301,9 @@ "tooltip": false, "viz": false }, - "lineInterpolation": "smooth", + "lineInterpolation": "linear", "lineWidth": 1, - "pointSize": 2, + "pointSize": 3, "scaleDistribution": { "type": "linear" }, @@ -1853,9 +2317,7 @@ "mode": "off" } }, - "decimals": 3, "mappings": [], - "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -1869,53 +2331,23 @@ } ] }, - "unit": "core" + "unit": "deckbytes" }, - "overrides": [ - { - "matcher": { - 
"id": "byName", - "options": "CPU Requests" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "super-light-blue", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Allocation" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "super-light-blue", - "mode": "fixed" - } - } - ] - } - ] + "overrides": [] }, "gridPos": { - "h": 11, - "w": 12, + "h": 7, + "w": 10, "x": 0, - "y": 59 + "y": 58 }, - "id": 48, + "id": 16, "options": { "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", @@ -1924,34 +2356,50 @@ }, "targets": [ { - "exemplar": true, - "expr": "label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\",pod=~\"$servicename-.*\", container=\"$mm_container\"}[$__rate_interval]), \"short_podname\", \"$1\", \"pod\",\"$servicename-(.*)\")", - "interval": "", - "legendFormat": "{{short_podname}}", - "refId": "A" + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", + "expr": "sum by (deployment) ( \n\tlabel_replace(\n sum by (pod) (modelmesh_instance_used_bytes{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}/1024),\n \"deployment\",\n \"$2 usage\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n)", + "hide": false, + "legendFormat": "__auto", + "range": true, + "refId": "C" }, { - "exemplar": true, - "expr": "avg(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{namespace=\"$namespace\",pod=~\"$servicename-.*\",container=\"$mm_container\"})", + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", + "expr": "sum by (deployment) ( \n\tlabel_replace(\n sum by (pod) (modelmesh_instance_capacity_bytes{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}/1024),\n \"deployment\",\n \"$2 capacity\",\n \"pod\",\n 
\"(modelmesh-serving)-(.*?)-(.*)\"\n )\n)", "hide": false, - "interval": "", - "legendFormat": "Allocation", - "refId": "B" + "legendFormat": "__auto", + "range": true, + "refId": "D" } ], - "title": "ModelMesh Container CPU Usage", + "title": "Cluster capacity and utilization", "type": "timeseries" }, { - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "description": "", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "axisSoftMin": 0, "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, @@ -1964,12 +2412,12 @@ }, "lineInterpolation": "linear", "lineWidth": 1, - "pointSize": 5, + "pointSize": 3, "scaleDistribution": { "type": "linear" }, - "showPoints": "never", - "spanNulls": true, + "showPoints": "auto", + "spanNulls": false, "stacking": { "group": "A", "mode": "none" @@ -1979,34 +2427,29 @@ } }, "mappings": [], - "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null - }, - { - "color": "red", - "value": 80 } ] }, - "unit": "decmbytes" + "unit": "locale" }, "overrides": [ { "matcher": { "id": "byName", - "options": "Memory Requests" + "options": "models with load failure" }, "properties": [ { "id": "color", "value": { - "fixedColor": "super-light-blue", - "mode": "fixed" + "fixedColor": "red", + "mode": "palette-classic" } } ] @@ -2014,68 +2457,97 @@ ] }, "gridPos": { - "h": 11, - "w": 12, - "x": 12, - "y": 59 + "h": 7, + "w": 10, + "x": 10, + "y": 58 }, - "id": 50, + "id": 45, "options": { - "graph": {}, "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, - "pluginVersion": "7.5.11", "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", 
"exemplar": true, - "expr": "label_replace(container_memory_usage_bytes{namespace=\"$namespace\",pod=~\"$servicename-.*\", container=\"$mm_container\"}/1024/1024,\"short_podname\", \"$1\", \"pod\",\"$servicename-(.*)\")", + "expr": "sum by (deployment) ( \n\tlabel_replace(\n sum by (pod) (modelmesh_models_with_failure_total{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}) ,\n \"deployment\",\n \"$2-failed models\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n ))", + "hide": false, "interval": "", - "legendFormat": "{{short_podname}}", + "legendFormat": "__auto", + "range": true, "refId": "A" }, { - "exemplar": true, - "expr": "avg(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{namespace=\"$namespace\",container=\"$mm_container\",pod=~\"$servicename-.*\"}/1024/1024)", + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", + "expr": "sum by (deployment) ( \n\tlabel_replace(\n sum by (pod) (modelmesh_instance_models_total{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}),\n \"deployment\",\n \"$2-loaded models\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n)", "hide": false, "interval": "", - "legendFormat": "Allocation", + "legendFormat": "__auto", + "range": true, "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", + "expr": "max(modelmesh_models_managed_total{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"})", + "hide": false, + "interval": "", + "legendFormat": "total managed models", + "range": true, + "refId": "C" } ], - "title": "Model Mesh Container Memory Usage", + "title": "Model Counts", "type": "timeseries" }, { - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "description": "", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + 
"axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 2, + "fillOpacity": 5, "gradientMode": "none", "hideFrom": { - "graph": false, "legend": false, "tooltip": false, "viz": false }, - "lineInterpolation": "smooth", + "lineInterpolation": "linear", "lineWidth": 1, - "pointSize": 2, + "pointSize": 5, "scaleDistribution": { "type": "linear" }, @@ -2089,7 +2561,6 @@ "mode": "off" } }, - "decimals": 2, "mappings": [], "min": 0, "thresholds": { @@ -2104,23 +2575,23 @@ "value": 80 } ] - }, - "unit": "core" + } }, "overrides": [] }, "gridPos": { - "h": 11, - "w": 12, - "x": 0, - "y": 70 + "h": 7, + "w": 4, + "x": 20, + "y": 58 }, - "id": 33, + "id": 91, "options": { "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", @@ -2129,34 +2600,375 @@ }, "targets": [ { - "exemplar": false, - "expr": "rate(container_cpu_usage_seconds_total{namespace=\"$namespace\",pod=~\"$servicename-.*\",container!=\"$mm_container\",container!=\"\"}[$__rate_interval])", - "hide": false, + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum by (deployment) (\n label_replace(\n count by (pod) (container_memory_usage_bytes{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\",container=\"mm\"}),\n \"deployment\",\n \"$2\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n)\n", + "interval": "", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Number of Pods", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Number of processed request per second", + "axisPlacement": "left", + 
"axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 21, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 65 + }, + "id": 92, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum by (deployment) ( \n\tlabel_replace(\n sum by (pod) (rate(modelmesh_invoke_model_milliseconds_count{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}[$__rate_interval])),\n \"deployment\",\n \"$2 external request\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n ))", + "hide": false, "interval": "", - "legendFormat": "{{container}}", + "legendFormat": "__auto", + "range": true, + "refId": "B" + } + ], + "title": "Inference API Request Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": 
false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 65 + }, + "id": 65, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", + "expr": " avg by (deployment) (\n label_replace(\n avg by (pod) (\n rate(\n modelmesh_request_size_bytes_sum{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}[$__rate_interval]\n )\n ),\n \"deployment\",\n \"$2 request size\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n )\n/\n avg by (deployment) (\n label_replace(\n avg by (pod) (\n rate(\n modelmesh_request_size_bytes_count{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}[$__rate_interval]\n )\n ),\n \"deployment\",\n \"$2 request size\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n )", + "hide": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", + "expr": " avg by (deployment) (\n label_replace(\n avg by (pod) (\n rate(\n modelmesh_response_size_bytes_sum{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}[$__rate_interval]\n )\n ),\n \"deployment\",\n \"$2 response size\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n )\n/\n avg by 
(deployment) (\n label_replace(\n avg by (pod) (\n rate(\n modelmesh_response_size_bytes_count{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}[$__rate_interval]\n )\n ),\n \"deployment\",\n \"$2 response size\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n )", + "hide": false, + "legendFormat": "__auto", + "range": true, + "refId": "B" + } + ], + "title": "Average Inference Request and Response Sizes", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Number of model loading per second", + "axisPlacement": "auto", + "axisSoftMax": 5, + "axisSoftMin": -5, + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 29, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 9, + "x": 0, + "y": 72 + }, + "id": 22, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", + "expr": "sum by (deployment) ( \n\tlabel_replace(\nsum by (pod) 
(increase(modelmesh_loadmodel_milliseconds_count{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}[$__rate_interval])),\n \"deployment\",\n \"$2 loads\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n)", + "hide": false, + "interval": "5m", + "legendFormat": "__auto", + "range": true, "refId": "A" }, { - "exemplar": true, - "expr": "cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{namespace=\"$namespace\",pod=~\"$servicename-.*\",container!=\"$mm_container\",container!=\"\"}", + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", + "expr": "-sum by (deployment) ( \n\tlabel_replace(\nsum by (pod) (increase(modelmesh_unloadmodel_milliseconds_count{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}[$__rate_interval])),\n \"deployment\",\n \"$2 unloads\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n)", "hide": false, - "interval": "", - "legendFormat": "{{container}}-alloc", + "interval": "5m", + "legendFormat": "__auto", + "range": true, "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", + "expr": "-sum by (deployment) ( \n\tlabel_replace(\nsum by (pod) (increase(modelmesh_age_at_eviction_milliseconds_count{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}[$__rate_interval])),\n \"deployment\",\n \"$2 evictions\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n)", + "hide": false, + "interval": "5m", + "legendFormat": "__auto", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", + "expr": "sum by (deployment) ( \n\tlabel_replace(\nsum by (pod) (increase(modelmesh_loadmodel_failure{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}[$__rate_interval])),\n \"deployment\",\n \"$2 failures\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n)", + "hide": false, + "interval": 
"5m", + "legendFormat": "__auto", + "range": true, + "refId": "D" } ], - "title": "Serving Runtime Container CPU Usage", + "title": "Model Loads/Unloads (per 5min)", "type": "timeseries" }, { - "datasource": "${DS_PROMETHEUS}", - "description": "", + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -2186,7 +2998,6 @@ } }, "mappings": [], - "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -2200,39 +3011,24 @@ } ] }, - "unit": "decmbytes" + "unit": "decbytes" }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Memory Requests" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "super-light-blue", - "mode": "fixed" - } - } - ] - } - ] + "overrides": [] }, "gridPos": { - "h": 11, - "w": 12, - "x": 12, - "y": 70 + "h": 10, + "w": 7, + "x": 9, + "y": 72 }, - "id": 77, + "id": 61, "options": { "graph": {}, "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", @@ -2242,26 +3038,40 @@ "pluginVersion": "7.5.11", "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", "exemplar": true, - "expr": "label_replace(container_memory_usage_bytes{namespace=\"$namespace\",pod=~\"$servicename-.*\",container!=\"$mm_container\",container!=\"\"}/1024/1024,\"short_podname\", \"$1\", \"pod\",\"$servicename-(.*)\")", + "expr": "avg(rate(modelmesh_loaded_model_size_bytes_sum{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))/avg(rate(modelmesh_loaded_model_size_bytes_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))", + "hide": true, "interval": "", - "legendFormat": "{{short_podname}}", + "legendFormat": "Loaded 
Model Size", + "range": true, "refId": "A" }, { - "exemplar": true, - "expr": "avg(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{namespace=\"$namespace\",container!=\"$mm_container\",container!=\"\",pod=~\"$servicename-.*\"}/1024/1024)", + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", + "expr": "avg by (deployment) (\n label_replace(\n avg by (pod) (\n rate(\n modelmesh_loaded_model_size_bytes_sum{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}[$__rate_interval]\n )\n ),\n \"deployment\",\n \"$2 model size\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n)\n\n/\n\navg by (deployment) (\n label_replace(\n avg by (pod) (\n rate(\n modelmesh_loaded_model_size_bytes_count{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}[$__rate_interval]\n )\n ),\n \"deployment\",\n \"$2 model size\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n)\n\n", "hide": false, - "interval": "", - "legendFormat": "Allocation", + "legendFormat": "__auto", + "range": true, "refId": "B" } ], - "title": "Serving Runtime Container Memory Usage", + "title": "Loaded Model Sizes", "type": "timeseries" }, { - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, "fieldConfig": { "defaults": { "color": { @@ -2274,7 +3084,7 @@ "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 5, + "fillOpacity": 8, "gradientMode": "none", "hideFrom": { "graph": false, @@ -2282,14 +3092,17 @@ "tooltip": false, "viz": false }, - "lineInterpolation": "smooth", + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, "lineWidth": 1, - "pointSize": 2, + "pointSize": 5, "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", - "spanNulls": false, + "showPoints": "never", + "spanNulls": true, "stacking": { "group": "A", "mode": "none" @@ -2298,7 +3111,7 @@ "mode": "off" } }, - "decimals": 3, + 
"decimals": 0, "mappings": [], "min": 0, "thresholds": { @@ -2314,48 +3127,17 @@ } ] }, - "unit": "core" + "unit": "ms" }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "CPU Requests" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "super-light-blue", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Allocation" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "super-light-blue", - "mode": "fixed" - } - } - ] - } - ] + "overrides": [] }, "gridPos": { - "h": 11, - "w": 12, - "x": 0, - "y": 80 + "h": 10, + "w": 8, + "x": 16, + "y": 72 }, - "id": 78, + "id": 47, "options": { "legend": { "calcs": [], @@ -2368,6 +3150,7 @@ "sort": "none" } }, + "pluginVersion": "8.1.5", "targets": [ { "datasource": { @@ -2375,10 +3158,9 @@ "uid": "P1809F7CD0C75ACF3" }, "editorMode": "code", - "exemplar": true, - "expr": "sum by (deployment) (label_replace(sum by (pod) (rate(container_cpu_usage_seconds_total{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[5m])), \"deployment\", \"$2\", \"pod\", \"(modelmesh-serving)-(.*)-(.*-.*-.*)", - "interval": "", - "legendFormat": "{{short_podname}}", + "expr": " avg by (deployment) (\n label_replace(\n avg by (pod) (\n rate(\n modelmesh_loadmodel_milliseconds_sum{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}[$__rate_interval]\n )\n ),\n \"deployment\",\n \"$2 loading time\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n )\n/\n avg by (deployment) (\n label_replace(\n avg by (pod) (\n rate(\n modelmesh_loadmodel_milliseconds_count{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}[$__rate_interval]\n )\n ),\n \"deployment\",\n \"$2 loading time\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n )", + "hide": false, + "legendFormat": "__auto", "range": true, "refId": "A" }, @@ -2388,21 +3170,600 @@ "uid": "P1809F7CD0C75ACF3" }, "editorMode": "code", - "exemplar": true, - "expr": 
"avg(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{namespace=\"$namespace\",pod=~\"$servicename-.*\",container=\"$mm_container\"})", - "hide": true, - "interval": "", - "legendFormat": "Allocation", + "expr": " avg by (deployment) (\n label_replace(\n avg by (pod) (\n rate(\n modelmesh_model_sizing_milliseconds_sum{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}[$__rate_interval]\n )\n ),\n \"deployment\",\n \"$2 sizing time\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n )\n/\n avg by (deployment) (\n label_replace(\n avg by (pod) (\n rate(\n modelmesh_model_sizing_milliseconds_count{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}[$__rate_interval]\n )\n ),\n \"deployment\",\n \"$2 sizing time\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n )", + "hide": false, + "legendFormat": "__auto", "range": true, "refId": "B" } ], - "title": "ModelMesh Deployment CPU Usage", + "title": "Model Loading Times", + "transformations": [], "type": "timeseries" + }, + { + "collapsed": true, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 82 + }, + "id": 69, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 5, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 2, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 3, + "mappings": [], + "min": 0, + 
"thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "core" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "CPU Requests" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "super-light-blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Allocation" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "super-light-blue", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 83 + }, + "id": 48, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "exemplar": true, + "expr": "label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\",pod=~\"$servicename-.*\", container=\"$mm_container\"}[$__rate_interval]), \"short_podname\", \"$1\", \"pod\",\"$servicename-(.*)\")", + "interval": "", + "legendFormat": "{{short_podname}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "exemplar": true, + "expr": "avg(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{namespace=\"$namespace\",pod=~\"$servicename-.*\",container=\"$mm_container\"})", + "hide": false, + "interval": "", + "legendFormat": "Allocation", + "refId": "B" + } + ], + "title": "ModelMesh Container CPU Usage", + "type": "timeseries" + }, + { + "datasource": "P1809F7CD0C75ACF3", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": 
"none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decmbytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Memory Requests" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "super-light-blue", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 83 + }, + "id": 50, + "options": { + "graph": {}, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "7.5.11", + "targets": [ + { + "datasource": { + "0": "P", + "1": "1", + "2": "8", + "3": "0", + "4": "9", + "5": "F", + "6": "7", + "7": "C", + "8": "D", + "9": "0", + "10": "C", + "11": "7", + "12": "5", + "13": "A", + "14": "C", + "15": "F", + "16": "3" + }, + "exemplar": true, + "expr": "label_replace(container_memory_usage_bytes{namespace=\"$namespace\",pod=~\"$servicename-.*\", container=\"$mm_container\"}/1024/1024,\"short_podname\", \"$1\", \"pod\",\"$servicename-(.*)\")", + "interval": "", + "legendFormat": "{{short_podname}}", + "refId": "A" + }, + { + "datasource": { + "0": "P", + "1": "1", + "2": "8", + "3": "0", + "4": "9", + "5": "F", + "6": "7", + "7": "C", + "8": "D", + "9": "0", + "10": "C", + "11": "7", + "12": "5", + "13": "A", + "14": "C", + "15": "F", + "16": "3" + }, + "exemplar": true, + "expr": 
"avg(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{namespace=\"$namespace\",container=\"$mm_container\",pod=~\"$servicename-.*\"}/1024/1024)", + "hide": false, + "interval": "", + "legendFormat": "Allocation", + "refId": "B" + } + ], + "title": "Model Mesh Container Memory Usage", + "type": "timeseries" + }, + { + "datasource": "P1809F7CD0C75ACF3", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 2, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 2, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "core" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 94 + }, + "id": 33, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "0": "P", + "1": "1", + "2": "8", + "3": "0", + "4": "9", + "5": "F", + "6": "7", + "7": "C", + "8": "D", + "9": "0", + "10": "C", + "11": "7", + "12": "5", + "13": "A", + "14": "C", + "15": "F", + "16": "3" + }, + "exemplar": false, + "expr": "rate(container_cpu_usage_seconds_total{namespace=\"$namespace\",pod=~\"$servicename-.*\",container!=\"$mm_container\",container!=\"\"}[$__rate_interval])", + "hide": false, + "interval": "", + 
"legendFormat": "{{container}}", + "refId": "A" + }, + { + "datasource": { + "0": "P", + "1": "1", + "2": "8", + "3": "0", + "4": "9", + "5": "F", + "6": "7", + "7": "C", + "8": "D", + "9": "0", + "10": "C", + "11": "7", + "12": "5", + "13": "A", + "14": "C", + "15": "F", + "16": "3" + }, + "exemplar": true, + "expr": "cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{namespace=\"$namespace\",pod=~\"$servicename-.*\",container!=\"$mm_container\",container!=\"\"}", + "hide": false, + "interval": "", + "legendFormat": "{{container}}-alloc", + "refId": "B" + } + ], + "title": "Serving Runtime Container CPU Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decmbytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Memory Requests" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "super-light-blue", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 94 + }, + "id": 77, + "options": { + "graph": {}, + "legend": { + "calcs": [], + "displayMode": "list", 
+ "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "7.5.11", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "exemplar": true, + "expr": "label_replace(container_memory_usage_bytes{namespace=\"$namespace\",pod=~\"$servicename-.*\",container!=\"$mm_container\",container!=\"\"}/1024/1024,\"short_podname\", \"$1\", \"pod\",\"$servicename-(.*)\")", + "interval": "", + "legendFormat": "{{short_podname}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "exemplar": true, + "expr": "avg(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{namespace=\"$namespace\",container!=\"$mm_container\",container!=\"\",pod=~\"$servicename-.*\"}/1024/1024)", + "hide": false, + "interval": "", + "legendFormat": "Allocation", + "refId": "B" + } + ], + "title": "Serving Runtime Container Memory Usage", + "type": "timeseries" + } + ], + "targets": [ + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "refId": "A" + } + ], + "title": "Container Resource Utilization", + "type": "row" } ], "refresh": false, - "schemaVersion": 35, + "schemaVersion": 37, "style": "dark", "tags": [], "templating": { @@ -2410,46 +3771,46 @@ { "current": { "selected": false, - "text": "my-namespace", - "value": "my-namespace" + "text": "modelmesh-serving", + "value": "modelmesh-serving" }, "hide": 0, "name": "namespace", "options": [ { "selected": true, - "text": "my-namespace", - "value": "my-namespace" + "text": "modelmesh-serving", + "value": "modelmesh-serving" } ], - "query": "my-namespace", + "query": "modelmesh-serving", "skipUrlSync": false, "type": "textbox" }, { "current": { "selected": false, - "text": "my-service-name", - "value": "my-service-name" + "text": "modelmesh-serving", + "value": "modelmesh-serving" }, "hide": 0, "name": "servicename", "options": [ { "selected": true, - 
"text": "my-service-name", - "value": "my-service-name" + "text": "modelmesh-serving", + "value": "modelmesh-serving" } ], - "query": "my-service-name", + "query": "modelmesh-serving", "skipUrlSync": false, "type": "textbox" }, { "current": { "selected": true, - "text": "mm-runtime", - "value": "mm-runtime" + "text": "mm", + "value": "mm" }, "hide": 0, "includeAll": false, @@ -2458,12 +3819,12 @@ "name": "mm_container", "options": [ { - "selected": false, + "selected": true, "text": "mm", "value": "mm" }, { - "selected": true, + "selected": false, "text": "mm-runtime", "value": "mm-runtime" }, @@ -2481,13 +3842,13 @@ ] }, "time": { - "from": "now-2d", + "from": "now-6h", "to": "now" }, "timepicker": {}, "timezone": "browser", - "title": "ModelMesh Metrics", + "title": "ModelMesh Dashboard", "uid": "vMm_rt-7z", - "version": 38, + "version": 1, "weekStart": "" } From 6942214ede829e0beb0ba23c53c96db9260b8308 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Mon, 3 Apr 2023 14:42:35 -0400 Subject: [PATCH 07/13] Adds dropdown for runtime selection and variables Signed-off-by: Rafael Vasquez --- config/grafana/ModelMeshMetricsDashboard.json | 575 +++++++++++------- 1 file changed, 369 insertions(+), 206 deletions(-) diff --git a/config/grafana/ModelMeshMetricsDashboard.json b/config/grafana/ModelMeshMetricsDashboard.json index 8ff798ac..6f1e5636 100644 --- a/config/grafana/ModelMeshMetricsDashboard.json +++ b/config/grafana/ModelMeshMetricsDashboard.json @@ -1,4 +1,35 @@ { + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "9.3.2" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + 
], "annotations": { "list": [ { @@ -24,12 +55,12 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 43, + "id": null, "links": [], "liveNow": false, "panels": [ { - "collapsed": true, + "collapsed": false, "gridPos": { "h": 1, "w": 24, @@ -44,7 +75,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { @@ -119,28 +150,6 @@ "value": "points" } ] - }, - { - "__systemRef": "hideSeriesFrom", - "matcher": { - "id": "byNames", - "options": { - "mode": "exclude", - "names": ["Age at Eviction"], - "prefix": "All except:", - "readOnly": true - } - }, - "properties": [ - { - "id": "custom.hideFrom", - "value": { - "legend": false, - "tooltip": false, - "viz": true - } - } - ] } ] }, @@ -167,7 +176,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "exemplar": true, @@ -180,13 +189,15 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, + "editorMode": "code", "exemplar": true, "expr": "max(rate(modelmesh_age_at_eviction_milliseconds_sum{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval])/rate(modelmesh_age_at_eviction_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))/1000", "hide": false, "interval": "", "legendFormat": "Age at Eviction", + "range": true, "refId": "B" } ], @@ -196,7 +207,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "description": "", "fieldConfig": { @@ -368,7 +379,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "exemplar": true, @@ -381,7 +392,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "exemplar": true, @@ -397,7 +408,10 @@ "type": "timeseries" }, { - 
"datasource": "P1809F7CD0C75ACF3", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, "description": "", "fieldConfig": { "defaults": { @@ -505,7 +519,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "description": "", "fieldConfig": { @@ -598,7 +612,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "exemplar": true, @@ -611,7 +625,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "exemplar": true, @@ -625,7 +639,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "exemplar": true, @@ -641,7 +655,10 @@ "type": "timeseries" }, { - "datasource": "P1809F7CD0C75ACF3", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, "description": "", "fieldConfig": { "defaults": { @@ -745,7 +762,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { @@ -826,7 +843,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "exemplar": true, @@ -840,7 +857,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "exemplar": true, @@ -858,7 +875,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { @@ -936,7 +953,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "exemplar": true, @@ -949,7 +966,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "exemplar": true, "expr": 
"avg(rate(modelmesh_response_size_bytes_sum{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))/avg(rate(modelmesh_response_size_bytes_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))", @@ -963,7 +980,10 @@ "type": "timeseries" }, { - "datasource": "P1809F7CD0C75ACF3", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, "fieldConfig": { "defaults": { "color": { @@ -1006,8 +1026,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1072,7 +1091,10 @@ "type": "timeseries" }, { - "datasource": "P1809F7CD0C75ACF3", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, "fieldConfig": { "defaults": { "color": { @@ -1118,8 +1140,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1186,7 +1207,10 @@ "type": "timeseries" }, { - "datasource": "P1809F7CD0C75ACF3", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, "fieldConfig": { "defaults": { "color": { @@ -1229,8 +1253,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1295,7 +1318,10 @@ "type": "timeseries" }, { - "datasource": "P1809F7CD0C75ACF3", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, "fieldConfig": { "defaults": { "color": { @@ -1336,8 +1362,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1402,7 +1427,10 @@ "type": "timeseries" }, { - "datasource": "P1809F7CD0C75ACF3", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, "fieldConfig": { "defaults": { "color": { @@ -1448,8 +1476,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1516,7 +1543,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + 
"uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { @@ -1560,8 +1587,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1633,7 +1659,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "exemplar": true, @@ -1650,7 +1676,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { @@ -1694,8 +1720,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1746,7 +1771,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "exemplar": true, @@ -1764,7 +1789,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { @@ -1808,8 +1833,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1844,7 +1868,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "exemplar": true, "expr": "avg(rate(modelmesh_cache_miss_milliseconds_sum{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[5m]))/avg(rate(modelmesh_cache_miss_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[5m]))", @@ -1859,7 +1883,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { @@ -1903,8 +1927,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1984,7 +2007,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "exemplar": true, @@ -1999,7 +2022,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": 
"${DS_PROMETHEUS}" }, "editorMode": "code", "exemplar": true, @@ -2014,7 +2037,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "exemplar": true, @@ -2029,7 +2052,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "exemplar": true, @@ -2048,7 +2071,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { @@ -2091,8 +2114,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2129,7 +2151,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "exemplar": true, @@ -2147,7 +2169,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { @@ -2195,8 +2217,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2232,7 +2253,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "exemplar": true, @@ -2245,7 +2266,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "exemplar": true, @@ -2277,7 +2298,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "description": "", "fieldConfig": { @@ -2293,7 +2314,7 @@ "axisSoftMin": 0, "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 10, + "fillOpacity": 21, "gradientMode": "none", "hideFrom": { "graph": false, @@ -2322,8 +2343,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2333,7 +2353,98 @@ }, "unit": "deckbytes" }, - "overrides": [] + "overrides": [ + { + 
"matcher": { + "id": "byName", + "options": "triton capacity" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "mlserver capacity" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "triton usage" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "super-light-red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "mlserver usage" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "super-light-blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "total usage" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "super-light-green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "total capacity" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + } + ] + } + ] }, "gridPos": { "h": 7, @@ -2358,10 +2469,10 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum by (deployment) ( \n\tlabel_replace(\n sum by (pod) (modelmesh_instance_used_bytes{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}/1024),\n \"deployment\",\n \"$2 usage\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n)", + "expr": " sum by (deployment) ( \n\tlabel_replace(\n sum by (pod) (modelmesh_instance_used_bytes{namespace=\"$namespace\",pod=~\"$servicename-$runtime-.*\"}/1024),\n \"deployment\",\n \"$2 usage\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n )", "hide": false, "legendFormat": "__auto", "range": true, @@ -2370,10 +2481,10 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": 
"${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum by (deployment) ( \n\tlabel_replace(\n sum by (pod) (modelmesh_instance_capacity_bytes{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}/1024),\n \"deployment\",\n \"$2 capacity\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n)", + "expr": "sum by (deployment) ( \n\tlabel_replace(\n sum by (pod) (modelmesh_instance_capacity_bytes{namespace=\"$namespace\",pod=~\"servicename-$runtime-.*\"}/1024),\n \"deployment\",\n \"$2 capacity\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n)", "hide": false, "legendFormat": "__auto", "range": true, @@ -2386,7 +2497,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "description": "", "fieldConfig": { @@ -2431,8 +2542,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] }, @@ -2479,11 +2589,11 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "exemplar": true, - "expr": "sum by (deployment) ( \n\tlabel_replace(\n sum by (pod) (modelmesh_models_with_failure_total{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}) ,\n \"deployment\",\n \"$2-failed models\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n ))", + "expr": "sum by (deployment) ( \n\tlabel_replace(\n sum by (pod) (modelmesh_models_with_failure_total{namespace=\"$namespace\",pod=~\"$servicename-$runtime-.*\"}) ,\n \"deployment\",\n \"$2-failed models\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n ))", "hide": false, "interval": "", "legendFormat": "__auto", @@ -2493,28 +2603,15 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum by (deployment) ( \n\tlabel_replace(\n sum by (pod) (modelmesh_instance_models_total{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}),\n \"deployment\",\n \"$2-loaded 
models\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n)", + "expr": "sum by (deployment) ( \n\tlabel_replace(\n sum by (pod) (modelmesh_instance_models_total{namespace=\"$namespace\",pod=~\"$servicename-$runtime-.*\"}),\n \"deployment\",\n \"$2-loaded models\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n)", "hide": false, "interval": "", "legendFormat": "__auto", "range": true, "refId": "B" - }, - { - "datasource": { - "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" - }, - "editorMode": "code", - "expr": "max(modelmesh_models_managed_total{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"})", - "hide": false, - "interval": "", - "legendFormat": "total managed models", - "range": true, - "refId": "C" } ], "title": "Model Counts", @@ -2523,7 +2620,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "description": "", "fieldConfig": { @@ -2567,8 +2664,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2602,11 +2698,11 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "exemplar": true, - "expr": "sum by (deployment) (\n label_replace(\n count by (pod) (container_memory_usage_bytes{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\",container=\"mm\"}),\n \"deployment\",\n \"$2\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n)\n", + "expr": "sum by (deployment) (\n label_replace(\n count by (pod) (container_memory_usage_bytes{namespace=\"$namespace\",pod=~\"$servicename-$runtime-.*\",container=\"$mm_container\"}),\n \"deployment\",\n \"$2\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n)\n", "interval": "", "legendFormat": "__auto", "range": true, @@ -2619,7 +2715,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { @@ -2663,8 +2759,7 @@ 
"mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2700,11 +2795,11 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "exemplar": true, - "expr": "sum by (deployment) ( \n\tlabel_replace(\n sum by (pod) (rate(modelmesh_invoke_model_milliseconds_count{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}[$__rate_interval])),\n \"deployment\",\n \"$2 external request\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n ))", + "expr": "sum by (deployment) ( \n\tlabel_replace(\n sum by (pod) (rate(modelmesh_invoke_model_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-$runtime-.*\"}[$__rate_interval])),\n \"deployment\",\n \"$2 external request\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n ))", "hide": false, "interval": "", "legendFormat": "__auto", @@ -2718,7 +2813,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { @@ -2760,8 +2855,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2796,10 +2890,10 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": " avg by (deployment) (\n label_replace(\n avg by (pod) (\n rate(\n modelmesh_request_size_bytes_sum{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}[$__rate_interval]\n )\n ),\n \"deployment\",\n \"$2 request size\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n )\n/\n avg by (deployment) (\n label_replace(\n avg by (pod) (\n rate(\n modelmesh_request_size_bytes_count{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}[$__rate_interval]\n )\n ),\n \"deployment\",\n \"$2 request size\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n )", + "expr": " avg by (deployment) (\n 
label_replace(\n avg by (pod) (\n rate(\n modelmesh_request_size_bytes_sum{namespace=\"$namespace\",pod=~\"$servicename-$runtime-.*\"}[$__rate_interval]\n )\n ),\n \"deployment\",\n \"$2 request size\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n )\n/\n avg by (deployment) (\n label_replace(\n avg by (pod) (\n rate(\n modelmesh_request_size_bytes_count{namespace=\"$namespace\",pod=~\"$servicename-$runtime-.*\"}[$__rate_interval]\n )\n ),\n \"deployment\",\n \"$2 request size\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n )", "hide": false, "legendFormat": "__auto", "range": true, @@ -2808,10 +2902,10 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": " avg by (deployment) (\n label_replace(\n avg by (pod) (\n rate(\n modelmesh_response_size_bytes_sum{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}[$__rate_interval]\n )\n ),\n \"deployment\",\n \"$2 response size\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n )\n/\n avg by (deployment) (\n label_replace(\n avg by (pod) (\n rate(\n modelmesh_response_size_bytes_count{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}[$__rate_interval]\n )\n ),\n \"deployment\",\n \"$2 response size\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n )", + "expr": " avg by (deployment) (\n label_replace(\n avg by (pod) (\n rate(\n modelmesh_response_size_bytes_sum{namespace=\"$namespace\",pod=~\"$servicename-$runtime-.*\"}[$__rate_interval]\n )\n ),\n \"deployment\",\n \"$2 response size\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n )\n/\n avg by (deployment) (\n label_replace(\n avg by (pod) (\n rate(\n modelmesh_response_size_bytes_count{namespace=\"$namespace\",pod=~\"$servicename-$runtime-.*\"}[$__rate_interval]\n )\n ),\n \"deployment\",\n \"$2 response size\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n )", "hide": false, "legendFormat": "__auto", "range": true, @@ 
-2824,7 +2918,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { @@ -2868,8 +2962,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2878,7 +2971,53 @@ ] } }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Model Unloads" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "semi-dark-yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Model Evictions" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "light-orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Load Failures" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "light-red", + "mode": "fixed" + } + } + ] + } + ] }, "gridPos": { "h": 10, @@ -2903,10 +3042,10 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum by (deployment) ( \n\tlabel_replace(\nsum by (pod) (increase(modelmesh_loadmodel_milliseconds_count{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}[$__rate_interval])),\n \"deployment\",\n \"$2 loads\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n)", + "expr": "sum by (deployment) ( \n\tlabel_replace(\nsum by (pod) (increase(modelmesh_loadmodel_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-$runtime-.*\"}[$__rate_interval])),\n \"deployment\",\n \"$2 loads\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n)", "hide": false, "interval": "5m", "legendFormat": "__auto", @@ -2916,10 +3055,10 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "-sum by (deployment) ( \n\tlabel_replace(\nsum by (pod) 
(increase(modelmesh_unloadmodel_milliseconds_count{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}[$__rate_interval])),\n \"deployment\",\n \"$2 unloads\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n)", + "expr": "-sum by (deployment) ( \n\tlabel_replace(\nsum by (pod) (increase(modelmesh_unloadmodel_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-$runtime-.*\"}[$__rate_interval])),\n \"deployment\",\n \"$2 unloads\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n)", "hide": false, "interval": "5m", "legendFormat": "__auto", @@ -2929,10 +3068,10 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "-sum by (deployment) ( \n\tlabel_replace(\nsum by (pod) (increase(modelmesh_age_at_eviction_milliseconds_count{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}[$__rate_interval])),\n \"deployment\",\n \"$2 evictions\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n)", + "expr": "-sum by (deployment) ( \n\tlabel_replace(\nsum by (pod) (increase(modelmesh_age_at_eviction_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-$runtime-.*\"}[$__rate_interval])),\n \"deployment\",\n \"$2 evictions\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n)", "hide": false, "interval": "5m", "legendFormat": "__auto", @@ -2942,10 +3081,10 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum by (deployment) ( \n\tlabel_replace(\nsum by (pod) (increase(modelmesh_loadmodel_failure{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}[$__rate_interval])),\n \"deployment\",\n \"$2 failures\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n)", + "expr": "sum by (deployment) ( \n\tlabel_replace(\nsum by (pod) (increase(modelmesh_loadmodel_failure{namespace=\"$namespace\",pod=~\"$servicename-$runtime-.*\"}[$__rate_interval])),\n 
\"deployment\",\n \"$2 failures\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n)", "hide": false, "interval": "5m", "legendFormat": "__auto", @@ -2959,7 +3098,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { @@ -3002,8 +3141,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -3040,24 +3178,10 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" - }, - "editorMode": "code", - "exemplar": true, - "expr": "avg(rate(modelmesh_loaded_model_size_bytes_sum{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))/avg(rate(modelmesh_loaded_model_size_bytes_count{namespace=\"$namespace\",pod=~\"$servicename-.*\"}[$__rate_interval]))", - "hide": true, - "interval": "", - "legendFormat": "Loaded Model Size", - "range": true, - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg by (deployment) (\n label_replace(\n avg by (pod) (\n rate(\n modelmesh_loaded_model_size_bytes_sum{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}[$__rate_interval]\n )\n ),\n \"deployment\",\n \"$2 model size\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n)\n\n/\n\navg by (deployment) (\n label_replace(\n avg by (pod) (\n rate(\n modelmesh_loaded_model_size_bytes_count{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}[$__rate_interval]\n )\n ),\n \"deployment\",\n \"$2 model size\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n)\n\n", + "expr": "avg by (deployment) (\n label_replace(\n avg by (pod) (\n rate(\n modelmesh_loaded_model_size_bytes_sum{namespace=\"$namespace\",pod=~\"$servicename-$runtime-.*\"}[$__rate_interval]\n )\n ),\n \"deployment\",\n \"$2 model size\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n)\n\n/\n\navg by (deployment) (\n 
label_replace(\n avg by (pod) (\n rate(\n modelmesh_loaded_model_size_bytes_count{namespace=\"$namespace\",pod=~\"$servicename-$runtime-.*\"}[$__rate_interval]\n )\n ),\n \"deployment\",\n \"$2 model size\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n)\n\n", "hide": false, "legendFormat": "__auto", "range": true, @@ -3070,7 +3194,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { @@ -3118,8 +3242,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -3155,10 +3278,10 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": " avg by (deployment) (\n label_replace(\n avg by (pod) (\n rate(\n modelmesh_loadmodel_milliseconds_sum{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}[$__rate_interval]\n )\n ),\n \"deployment\",\n \"$2 loading time\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n )\n/\n avg by (deployment) (\n label_replace(\n avg by (pod) (\n rate(\n modelmesh_loadmodel_milliseconds_count{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}[$__rate_interval]\n )\n ),\n \"deployment\",\n \"$2 loading time\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n )", + "expr": " avg by (deployment) (\n label_replace(\n avg by (pod) (\n rate(\n modelmesh_loadmodel_milliseconds_sum{namespace=\"$namespace\",pod=~\"$servicename-$runtime-.*\"}[$__rate_interval]\n )\n ),\n \"deployment\",\n \"$2 loading time\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n )\n/\n avg by (deployment) (\n label_replace(\n avg by (pod) (\n rate(\n modelmesh_loadmodel_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-$runtime-.*\"}[$__rate_interval]\n )\n ),\n \"deployment\",\n \"$2 loading time\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n )", "hide": false, "legendFormat": "__auto", 
"range": true, @@ -3167,10 +3290,10 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": " avg by (deployment) (\n label_replace(\n avg by (pod) (\n rate(\n modelmesh_model_sizing_milliseconds_sum{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}[$__rate_interval]\n )\n ),\n \"deployment\",\n \"$2 sizing time\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n )\n/\n avg by (deployment) (\n label_replace(\n avg by (pod) (\n rate(\n modelmesh_model_sizing_milliseconds_count{namespace=\"modelmesh-serving\",pod=~\"modelmesh-serving-.*\"}[$__rate_interval]\n )\n ),\n \"deployment\",\n \"$2 sizing time\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n )", + "expr": " avg by (deployment) (\n label_replace(\n avg by (pod) (\n rate(\n modelmesh_model_sizing_milliseconds_sum{namespace=\"$namespace\",pod=~\"$servicename-$runtime-.*\"}[$__rate_interval]\n )\n ),\n \"deployment\",\n \"$2 sizing time\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n )\n/\n avg by (deployment) (\n label_replace(\n avg by (pod) (\n rate(\n modelmesh_model_sizing_milliseconds_count{namespace=\"$namespace\",pod=~\"$servicename-$runtime-.*\"}[$__rate_interval]\n )\n ),\n \"deployment\",\n \"$2 sizing time\",\n \"pod\",\n \"(modelmesh-serving)-(.*?)-(.*)\"\n )\n )", "hide": false, "legendFormat": "__auto", "range": true, @@ -3198,7 +3321,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { @@ -3243,8 +3366,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -3291,7 +3413,7 @@ "h": 11, "w": 12, "x": 0, - "y": 83 + "y": 27 }, "id": 48, "options": { @@ -3310,7 +3432,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "exemplar": true, "expr": 
"label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\",pod=~\"$servicename-.*\", container=\"$mm_container\"}[$__rate_interval]), \"short_podname\", \"$1\", \"pod\",\"$servicename-(.*)\")", @@ -3321,7 +3443,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "exemplar": true, "expr": "avg(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{namespace=\"$namespace\",pod=~\"$servicename-.*\",container=\"$mm_container\"})", @@ -3335,7 +3457,10 @@ "type": "timeseries" }, { - "datasource": "P1809F7CD0C75ACF3", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, "fieldConfig": { "defaults": { "color": { @@ -3378,8 +3503,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -3411,7 +3535,7 @@ "h": 11, "w": 12, "x": 12, - "y": 83 + "y": 27 }, "id": 50, "options": { @@ -3487,7 +3611,10 @@ "type": "timeseries" }, { - "datasource": "P1809F7CD0C75ACF3", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, "fieldConfig": { "defaults": { "color": { @@ -3531,8 +3658,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -3548,7 +3674,7 @@ "h": 11, "w": 12, "x": 0, - "y": 94 + "y": 38 }, "id": 33, "options": { @@ -3625,7 +3751,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "description": "", "fieldConfig": { @@ -3670,8 +3796,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -3703,7 +3828,7 @@ "h": 11, "w": 12, "x": 12, - "y": 94 + "y": 38 }, "id": 77, "options": { @@ -3724,7 +3849,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "exemplar": true, "expr": 
"label_replace(container_memory_usage_bytes{namespace=\"$namespace\",pod=~\"$servicename-.*\",container!=\"$mm_container\",container!=\"\"}/1024/1024,\"short_podname\", \"$1\", \"pod\",\"$servicename-(.*)\")", @@ -3735,7 +3860,7 @@ { "datasource": { "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "uid": "${DS_PROMETHEUS}" }, "exemplar": true, "expr": "avg(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{namespace=\"$namespace\",container!=\"$mm_container\",container!=\"\",pod=~\"$servicename-.*\"}/1024/1024)", @@ -3771,60 +3896,60 @@ { "current": { "selected": false, - "text": "modelmesh-serving", - "value": "modelmesh-serving" + "text": "my-namespace", + "value": "my-namespace" }, "hide": 0, "name": "namespace", "options": [ { "selected": true, - "text": "modelmesh-serving", - "value": "modelmesh-serving" + "text": "my-namespace", + "value": "my-namespace" } ], - "query": "modelmesh-serving", + "query": "my-namespace", "skipUrlSync": false, "type": "textbox" }, { "current": { "selected": false, - "text": "modelmesh-serving", - "value": "modelmesh-serving" + "text": "my-service-name", + "value": "my-service-name" }, "hide": 0, "name": "servicename", "options": [ { "selected": true, - "text": "modelmesh-serving", - "value": "modelmesh-serving" + "text": "my-service-name", + "value": "my-service-name" } ], - "query": "modelmesh-serving", + "query": "my-service-name", "skipUrlSync": false, "type": "textbox" }, { "current": { "selected": true, - "text": "mm", - "value": "mm" + "text": "mm-runtime", + "value": "mm-runtime" }, "hide": 0, "includeAll": false, - "label": "MM Container Name", + "label": "mm container", "multi": false, "name": "mm_container", "options": [ { - "selected": true, + "selected": false, "text": "mm", "value": "mm" }, { - "selected": false, + "selected": true, "text": "mm-runtime", "value": "mm-runtime" }, @@ -3838,17 +3963,55 @@ "queryValue": "", "skipUrlSync": false, "type": "custom" + }, + { + "current": { + 
"selected": true, + "text": ["mlserver"], + "value": ["mlserver"] + }, + "hide": 0, + "includeAll": false, + "label": "runtime", + "multi": true, + "name": "runtime", + "options": [ + { + "selected": true, + "text": "mlserver", + "value": "mlserver" + }, + { + "selected": false, + "text": "triton", + "value": "triton" + }, + { + "selected": false, + "text": "ovms", + "value": "ovms" + }, + { + "selected": false, + "text": "torchserve", + "value": "torchserve" + } + ], + "query": "mlserver,triton,ovms,torchserve", + "queryValue": "", + "skipUrlSync": false, + "type": "custom" } ] }, "time": { - "from": "now-6h", + "from": "now-2d", "to": "now" }, "timepicker": {}, "timezone": "browser", "title": "ModelMesh Dashboard", - "uid": "vMm_rt-7z", + "uid": "vMm_rt-7z-new", "version": 1, "weekStart": "" } From 213f368f83517add791bf1fc19ffa96c64973f02 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Wed, 5 Apr 2023 13:16:22 -0400 Subject: [PATCH 08/13] Updates monitoring doc Signed-off-by: Rafael Vasquez --- docs/monitoring.md | 172 +++++++++++++++++++++++++++++++++------------ 1 file changed, 127 insertions(+), 45 deletions(-) diff --git a/docs/monitoring.md b/docs/monitoring.md index 4936a858..26597f83 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -1,36 +1,14 @@ -# Monitoring +~~# ModelMesh Metrics -ModelMesh Serving monitoring is designed to work with Prometheus and requires Prometheus and optionally Grafana to be available. +## Overview -The [Prometheus Operator](https://github.com/prometheus-operator/prometheus-operator) is recommended. Some instructions for setting these up for your cluster can be found in the ModelMesh Performance -repository located [here](https://github.com/kserve/modelmesh-performance/tree/main/docs/monitoring). To learn more about how the Prometheus Operator works, check out their -[Getting Started](https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/user-guides/getting-started.md) guide. 
+Serving runtime pods expose the endpoint `/metrics` on port `2112` and scheme `https` and the metrics published by each pod are mostly disjoint, for aggregation by a monitoring framework (e.g. Grafana), except for [service-wide metrics](#Metrics) with the scope `Deployment`. These are published by only one of the pods at a given time. -Once Prometheus is configured to scrape the user projects, you can see the ModelMesh Serving metrics in Grafana UI. +Endpoints associated with a ModelMesh Serving service (`modelmesh-serving` by default) should be used to track the serving runtime pods' IPs from which Prometheus metrics should be scraped. -### `/metrics` endpoint +To override the default metrics port and scheme, you can set the variables `metrics.port` and `metrics.scheme` in the [`model-serving-config` configmap](configuration/README.md). -Serving runtime pods exposes the endpoint `/metrics` on port `2112` and scheme `https` and the metrics published by each pod are mostly disjoint, for aggregation by the monitoring framework e.g. Grafana. Except for service-wide [metrics](#Metrics) with the scope "Deployment" that are published by only one of the pods at a given time. - -`Endpoint`s resource associated with ModelMesh Serving service(default name `modelmesh-serving` but could be different) should be used to track the serving runtime pod IPs from which Prometheus metrics should be scraped. - -To override the default metrics port and scheme, you need to add the configuration `metrics.port` and `metrics.scheme` in the main configmap (see [configuration](./configuration/README.md)). - -## Service Monitor - -A [ServiceMonitor](https://prometheus-operator.dev/docs/operator/design/#servicemonitor) CRD is provided by the Prometheus Operator and is leveraged by ModelMesh for monitoring pods -through the `modelmesh-serving` service. By default, when the ModelMesh controller is started, the existence of this `ServiceMonitor` CRD is checked. 
If available and `metrics.enabled` is `true`, a `ServiceMonitor` resource -will be created for monitoring `ServingRuntime` pods. - -If you have an alternative solution to collect the metrics, you can disable the creation of `ServiceMonitor` by adding the configuration `metrics.disablePrometheusOperatorSupport` set to `true` in the main configmap (see [configuration](./configuration/README.md)). - -## Grafana Dashboard - -We suggest using Grafana to visualize the Prometheus monitoring data. You can learn more about deploying/configuring both Prometheus and Grafana by checking out [this repo](https://github.com/prometheus-operator/kube-prometheus#quickstart). Also, check out [this page](https://github.com/kserve/modelmesh-performance/blob/main/docs/monitoring/README.md##Setup-Prometheus-Operator) for some tips on how to set it up. - -When a Grafana instance is installed and running in the cluster, [this JSON file](/config/grafana/ModelMeshMetricsDashboard.json) containing our Grafana Dashboard with ModelMesh metrics is suggested to view the metrics below. - -## Metrics +### Metrics ModelMesh Serving publishes a variety of metrics related to model request rates and timings, model loading/unloading rates, times and sizes, internal queuing delays, capacity/usage, cache state/LRU, etc. Each serving runtime pod exposes its own metrics that should be aggregated, except for service-wide metrics with the scope "Deployment" that are published by only one of the serving runtime pods at a given time. @@ -63,38 +41,142 @@ Here is the list of metrics exposed by ModelMesh Serving: | modelmesh_instance_used_bps | Gauge | Pod | Model capacity utilization in basis points (100ths of percent) | | modelmesh_instance_models_total | Gauge | Pod | Number of model copies loaded in pod | -Note that the request metrics include labels `method` and `code` with the method name and gRPC response code respectively. The code for successful requests is `OK`. 
+**Note**: The request metrics include labels `method` and `code` with the method name and gRPC response code respectively. The code for successful requests is `OK`. + +The best way to visualize the metrics is to use Prometheus to collect them from targets by scraping the metrics HTTP endpoints coupled with a Grafana dashboard. Setup instructions are provided below and involve the following steps: + +1. [Set up Prometheus Operator](#setup-prometheus-operator) +2. [Create the ServiceMonitor CRD](#create-the-servicemonitor-crd) +3. [Import Grafana Dashboard](#Import-Grafana-Dashboard) + +## Monitoring Setup + +### Set up Prometheus Operator +The [Prometheus Operator](https://github.com/prometheus-operator/prometheus-operator) is the easiest way to set up both Prometheus and Grafana natively in a Kubernetes cluster. You can clone the [`kube-prometheus`](https://github.com/prometheus-operator/kube-prometheus) project and follow the [quickstart](https://github.com/prometheus-operator/kube-prometheus#quickstart) instructions to set it up. +By default, the operator sets RBAC rules to enable monitoring for the `default`, `monitoring`, and `kube-system` namespaces to collect Kubernetes and node metrics. 
+ +#### Monitor Additional Namespaces +To monitor the `modelmesh-serving` namespace, in the cloned `kube-prometheus` repository, add the following to `manifests/prometheus-roleBindingSpecificNamespaces.yaml`: + +```yaml +- apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.30.2 + name: prometheus-k8s + namespace: modelmesh-serving + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s + subjects: + - kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring +``` + +and to `manifests/prometheus-roleSpecificNamespaces.yaml`: +```yaml +- apiVersion: rbac.authorization.k8s.io/v1 + kind: Role + metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.30.2 + name: prometheus-k8s + namespace: modelmesh-serving + rules: + - apiGroups: + - "" + resources: + - services + - endpoints + - pods + verbs: + - get + - list + - watch + - apiGroups: + - extensions + resources: + - ingresses + verbs: + - get + - list + - watch + - apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch +``` + +#### Increase Retention Period +By default, Prometheus only keeps a 24-hour history record. To increase the retention period, modify `manifests/prometheus-prometheus.yaml` by adding: + +```yaml +spec: + ... + resources: + requests: + memory: 400Mi + # To change the retention period to 7 days, add the line below + retention: 7d + ... +``` + +Other configurable Prometheus specification fields are listed [here](https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#prometheusspec). 
+ + +## Create the ServiceMonitor CRD + +The [ServiceMonitor](https://prometheus-operator.dev/docs/operator/design/#servicemonitor) CRD is provided by the Prometheus Operator and is leveraged by ModelMesh for monitoring pods +through the `modelmesh-serving` service. By default, when the ModelMesh controller is started, the `ServiceMonitor` is checked. If it exists and `metrics.enabled` is `true`, a `ServiceMonitor` resource +will be created for monitoring `ServingRuntime` pods. + +If you have an alternative solution to collect the metrics, you can disable the creation of `ServiceMonitor` by setting the configuration `metrics.disablePrometheusOperatorSupport` to `true` in the [`model-serving-config` configmap](configuration/README.md)). + +## Import the Grafana Dashboard + +To access [Grafana](https://github.com/grafana/grafana) and visualize the Prometheus-monitored data, follow the instructions [here](https://github.com/prometheus-operator/kube-prometheus/blob/main/docs/access-ui.md#grafana). + +We provide a [pre-built dashboard](/config/grafana/ModelMeshMetricsDashboard.json) which includes many important ModelMesh metrics and views. You can import it using the guide [here](https://grafana.com/docs/grafana/latest/dashboards/manage-dashboards/#import-a-dashboard). ## Troubleshooting If the ModelMesh Serving metric(s) are missing in the monitoring UIs: -- Check if the Serving Runtime pod is up and running +- Check if the Serving Runtime pod is up and running. 
-- Check if the following annotations are configured in the Serving Runtime deployment: + - Check if the annotations are configured in the Serving Runtime deployment: - kubectl describe deployment modelmesh-serving-mlserver-0.x -n $NAMESPACE | grep "prometheus.io" + kubectl describe deployment modelmesh-serving-mlserver-0.x -n $NAMESPACE | grep "prometheus.io" - Expected output: + Annotations: prometheus.io/path: /metrics + prometheus.io/port: 2112 + prometheus.io/scheme: https + prometheus.io/scrape: true - prometheus.io/path: /metrics - prometheus.io/port: 2112 - prometheus.io/scheme: https - prometheus.io/scrape: true + **Note:** The configured `metrics.port` must be listed in the annotation `prometheus.io/port`. The default port is `2112`. - **Note:** Configured `metrics.port` must be listed in the annotation "prometheus.io/port". Default port is 2112. - -- Check if the configured `metrics.port` is exposed through service `modelmesh-serving` +- Check if the configured `metrics.port` is exposed through the service `modelmesh-serving`: kubectl get svc modelmesh-serving -n $NAMESPACE - Expected output: - NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE modelmesh-serving ClusterIP .......... 8033/TCP,2112/TCP ... - **Note:** Configured `metrics.port` must be listed in the PORT(S). Default port is 2112. + **Note:** The configured `metrics.port` must be listed in the `PORT(S)` column of the output. The default port is `2112`. -- Check if `ServiceMonitor` resource with name `modelmesh-metrics-monitor` is created +- Check if the `ServiceMonitor` resource with name `modelmesh-metrics-monitor` exists.
+Additional troubleshooting steps can be found [here](https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/troubleshooting.md#troubleshooting-servicemonitor-changes).~~ From d73db7ef871a6353727b14803363edc132da11b0 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Wed, 5 Apr 2023 13:17:14 -0400 Subject: [PATCH 09/13] Fixes title in doc Signed-off-by: Rafael Vasquez --- docs/monitoring.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/monitoring.md b/docs/monitoring.md index 26597f83..d0233b62 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -1,4 +1,4 @@ -~~# ModelMesh Metrics +# ModelMesh Metrics ## Overview From d5d9d5a3ee3fedd66def12181b14700f9d59177a Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Wed, 5 Apr 2023 13:27:55 -0400 Subject: [PATCH 10/13] Adds servicemonitor yaml and updates docs Signed-off-by: Rafael Vasquez --- config/grafana/servicemonitor.yaml | 20 ++++++++++++++++++++ docs/monitoring.md | 20 ++++++++++++-------- 2 files changed, 32 insertions(+), 8 deletions(-) create mode 100644 config/grafana/servicemonitor.yaml diff --git a/config/grafana/servicemonitor.yaml b/config/grafana/servicemonitor.yaml new file mode 100644 index 00000000..87c9e827 --- /dev/null +++ b/config/grafana/servicemonitor.yaml @@ -0,0 +1,20 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + modelmesh-service: modelmesh-serving + name: modelmesh-service-monitor + namespace: monitoring +spec: + endpoints: + - path: /metrics + port: "prometheus" + scheme: "https" + tlsConfig: + insecureSkipVerify: true + selector: + matchLabels: + modelmesh-service: modelmesh-serving + namespaceSelector: + matchNames: + - modelmesh-serving diff --git a/docs/monitoring.md b/docs/monitoring.md index d0233b62..effdcaad 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -136,20 +136,24 @@ spec: Other configurable Prometheus specification fields are listed 
[here](https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#prometheusspec). +## Create a ServiceMonitor -## Create the ServiceMonitor CRD +[ServiceMonitor](https://prometheus-operator.dev/docs/operator/design/#servicemonitor) is a custom resource definition provided by Prometheus Operator and is leveraged by ModelMesh for monitoring pods through the `modelmesh-serving` service. -The [ServiceMonitor](https://prometheus-operator.dev/docs/operator/design/#servicemonitor) CRD is provided by the Prometheus Operator and is leveraged by ModelMesh for monitoring pods -through the `modelmesh-serving` service. By default, when the ModelMesh controller is started, the `ServiceMonitor` is checked. If it exists and `metrics.enabled` is `true`, a `ServiceMonitor` resource +Create a `ServiceMonitor` to monitor the `modelmesh-serving` service using the definition found [here](../config/grafana/servicemonitor.yaml). +```bash +kubectl apply -f servicemonitor.yaml +``` +After the `ServiceMonitor` is created, the Prometheus operator will dynamically discover the pods with the label `modelmesh-service: modelmesh-serving` and scrape the metrics endpoint exposed by those pods. + +**Note**: By default, when the ModelMesh controller is started, the `ServiceMonitor` is checked. If it exists and `metrics.enabled` is `true`, a `ServiceMonitor` resource will be created for monitoring `ServingRuntime` pods. -If you have an alternative solution to collect the metrics, you can disable the creation of `ServiceMonitor` by setting the configuration `metrics.disablePrometheusOperatorSupport` to `true` in the [`model-serving-config` configmap](configuration/README.md)). +If you have an alternative solution to collect the metrics, you can disable the creation of `ServiceMonitor` by setting the configuration `metrics.disablePrometheusOperatorSupport` to `true` in the [`model-serving-config` configmap](configuration/README.md). 
## Import the Grafana Dashboard -To access [Grafana](https://github.com/grafana/grafana) and visualize the Prometheus-monitored data, follow the instructions [here](https://github.com/prometheus-operator/kube-prometheus/blob/main/docs/access-ui.md#grafana). - -We provide a [pre-built dashboard](/config/grafana/ModelMeshMetricsDashboard.json) which includes many important ModelMesh metrics and views. You can import it using the guide [here](https://grafana.com/docs/grafana/latest/dashboards/manage-dashboards/#import-a-dashboard). +To access [Grafana](https://github.com/grafana/grafana) and visualize the Prometheus-monitored data, follow the instructions [here](https://github.com/prometheus-operator/kube-prometheus/blob/main/docs/access-ui.md#grafana) and import the [pre-built dashboard](/config/grafana/ModelMeshMetricsDashboard.json) we provide using [this guide](https://grafana.com/docs/grafana/latest/dashboards/manage-dashboards/#import-a-dashboard). ## Troubleshooting @@ -179,4 +183,4 @@ If the ModelMesh Serving metric(s) are missing in the monitoring UIs: - Check if the `ServiceMonitor` resource with name `modelmesh-metrics-monitor` exists. -Additional troubleshooting steps can be found [here](https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/troubleshooting.md#troubleshooting-servicemonitor-changes).~~ +Additional troubleshooting steps can be found [here](https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/troubleshooting.md#troubleshooting-servicemonitor-changes). 
From fa8af662b293d8301d75ab65c50ec00669ac67ea Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Wed, 5 Apr 2023 13:38:03 -0400 Subject: [PATCH 11/13] Lints doc Signed-off-by: Rafael Vasquez --- docs/monitoring.md | 70 +++++++++++++++++++++++++--------------------- 1 file changed, 38 insertions(+), 32 deletions(-) diff --git a/docs/monitoring.md b/docs/monitoring.md index effdcaad..f19eae77 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -2,7 +2,7 @@ ## Overview -Serving runtime pods expose the endpoint `/metrics` on port `2112` and scheme `https` and the metrics published by each pod are mostly disjoint, for aggregation by a monitoring framework (e.g. Grafana), except for [service-wide metrics](#Metrics) with the scope `Deployment`. These are published by only one of the pods at a given time. +Serving runtime pods expose the endpoint `/metrics` on port `2112` and scheme `https` and the metrics published by each pod are mostly disjoint, for aggregation by a monitoring framework (e.g. Grafana), except for [service-wide metrics](#Metrics) with the scope `Deployment`. These are published by only one of the pods at a given time. Endpoints associated with a ModelMesh Serving service (`modelmesh-serving` by default) should be used to track the serving runtime pods' IPs from which Prometheus metrics should be scraped. @@ -52,10 +52,12 @@ The best way to visualize the metrics is to use Prometheus to collect them from ## Monitoring Setup ### Set up Prometheus Operator + The [Prometheus Operator](https://github.com/prometheus-operator/prometheus-operator) is the easiest way to set up both Prometheus and Grafana natively in a Kubernetes cluster. You can clone the [`kube-prometheus`](https://github.com/prometheus-operator/kube-prometheus) project and follow the [quickstart](https://github.com/prometheus-operator/kube-prometheus#quickstart) instructions to set it up. 
By default, the operator sets RBAC rules to enable monitoring for the `default`, `monitoring`, and `kube-system` namespaces to collect Kubernetes and node metrics. #### Monitor Additional Namespaces + To monitor the `modelmesh-serving` namespace, in the cloned `kube-prometheus` repository, add the following to `manifests/prometheus-roleBindingSpecificNamespaces.yaml`: ```yaml @@ -74,12 +76,13 @@ To monitor the `modelmesh-serving` namespace, in the cloned `kube-prometheus` re kind: Role name: prometheus-k8s subjects: - - kind: ServiceAccount - name: prometheus-k8s - namespace: monitoring + - kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring ``` and to `manifests/prometheus-roleSpecificNamespaces.yaml`: + ```yaml - apiVersion: rbac.authorization.k8s.io/v1 kind: Role @@ -92,35 +95,36 @@ and to `manifests/prometheus-roleSpecificNamespaces.yaml`: name: prometheus-k8s namespace: modelmesh-serving rules: - - apiGroups: - - "" - resources: - - services - - endpoints - - pods - verbs: - - get - - list - - watch - - apiGroups: - - extensions - resources: - - ingresses - verbs: - - get - - list - - watch - - apiGroups: - - networking.k8s.io - resources: - - ingresses - verbs: - - get - - list - - watch + - apiGroups: + - "" + resources: + - services + - endpoints + - pods + verbs: + - get + - list + - watch + - apiGroups: + - extensions + resources: + - ingresses + verbs: + - get + - list + - watch + - apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch ``` #### Increase Retention Period + By default, Prometheus only keeps a 24-hour history record. To increase the retention period, modify `manifests/prometheus-prometheus.yaml` by adding: ```yaml @@ -136,14 +140,16 @@ spec: Other configurable Prometheus specification fields are listed [here](https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#prometheusspec). 
-## Create a ServiceMonitor +## Create a ServiceMonitor -[ServiceMonitor](https://prometheus-operator.dev/docs/operator/design/#servicemonitor) is a custom resource definition provided by Prometheus Operator and is leveraged by ModelMesh for monitoring pods through the `modelmesh-serving` service. +[ServiceMonitor](https://prometheus-operator.dev/docs/operator/design/#servicemonitor) is a custom resource definition provided by Prometheus Operator and is leveraged by ModelMesh for monitoring pods through the `modelmesh-serving` service. Create a `ServiceMonitor` to monitor the `modelmesh-serving` service using the definition found [here](../config/grafana/servicemonitor.yaml). + ```bash kubectl apply -f servicemonitor.yaml ``` + After the `ServiceMonitor` is created, the Prometheus operator will dynamically discover the pods with the label `modelmesh-service: modelmesh-serving` and scrape the metrics endpoint exposed by those pods. **Note**: By default, when the ModelMesh controller is started, the `ServiceMonitor` is checked. If it exists and `metrics.enabled` is `true`, a `ServiceMonitor` resource From 4d34ee428df20fd1f2252aeac0846b36b4bc3d5e Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Wed, 5 Apr 2023 14:19:56 -0400 Subject: [PATCH 12/13] Updates in-doc links Signed-off-by: Rafael Vasquez --- docs/monitoring.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/monitoring.md b/docs/monitoring.md index f19eae77..6508238b 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -45,9 +45,9 @@ Here is the list of metrics exposed by ModelMesh Serving: The best way to visualize the metrics is to use Prometheus to collect them from targets by scraping the metrics HTTP endpoints coupled with a Grafana dashboard. Setup instructions are provided below and involve the following steps: -1. [Set up Prometheus Operator](#setup-prometheus-operator) -2. [Create the ServiceMonitor CRD](#create-the-servicemonitor-crd) -3. 
[Import Grafana Dashboard](#Import-Grafana-Dashboard) +1. [Set up Prometheus Operator](#set-up-prometheus-operator) +2. [Create the ServiceMonitor Resource](#create-the-servicemonitor-resource) +3. [Import the Grafana Dashboard](#import-the-grafana-dashboard) ## Monitoring Setup @@ -140,7 +140,7 @@ spec: Other configurable Prometheus specification fields are listed [here](https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#prometheusspec). -## Create a ServiceMonitor +## Create the ServiceMonitor Resource [ServiceMonitor](https://prometheus-operator.dev/docs/operator/design/#servicemonitor) is a custom resource definition provided by Prometheus Operator and is leveraged by ModelMesh for monitoring pods through the `modelmesh-serving` service. From 2a4ebc5b08002ba1e170ddae86ef24ae87f0a749 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Wed, 19 Apr 2023 12:30:43 -0400 Subject: [PATCH 13/13] Refactors and updates doc Signed-off-by: Rafael Vasquez --- config/{grafana => dashboard}/ModelMeshMetricsDashboard.json | 0 config/{grafana => prometheus}/servicemonitor.yaml | 0 docs/monitoring.md | 4 ++-- 3 files changed, 2 insertions(+), 2 deletions(-) rename config/{grafana => dashboard}/ModelMeshMetricsDashboard.json (100%) rename config/{grafana => prometheus}/servicemonitor.yaml (100%) diff --git a/config/grafana/ModelMeshMetricsDashboard.json b/config/dashboard/ModelMeshMetricsDashboard.json similarity index 100% rename from config/grafana/ModelMeshMetricsDashboard.json rename to config/dashboard/ModelMeshMetricsDashboard.json diff --git a/config/grafana/servicemonitor.yaml b/config/prometheus/servicemonitor.yaml similarity index 100% rename from config/grafana/servicemonitor.yaml rename to config/prometheus/servicemonitor.yaml diff --git a/docs/monitoring.md b/docs/monitoring.md index 6508238b..9876794e 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -144,7 +144,7 @@ Other configurable Prometheus specification fields are
listed [here](https://git [ServiceMonitor](https://prometheus-operator.dev/docs/operator/design/#servicemonitor) is a custom resource definition provided by Prometheus Operator and is leveraged by ModelMesh for monitoring pods through the `modelmesh-serving` service. -Create a `ServiceMonitor` to monitor the `modelmesh-serving` service using the definition found [here](../config/grafana/servicemonitor.yaml). +Create a `ServiceMonitor` to monitor the `modelmesh-serving` service using the definition found [here](../config/prometheus/servicemonitor.yaml). ```bash kubectl apply -f servicemonitor.yaml @@ -159,7 +159,7 @@ If you have an alternative solution to collect the metrics, you can disable the ## Import the Grafana Dashboard -To access [Grafana](https://github.com/grafana/grafana) and visualize the Prometheus-monitored data, follow the instructions [here](https://github.com/prometheus-operator/kube-prometheus/blob/main/docs/access-ui.md#grafana) and import the [pre-built dashboard](/config/grafana/ModelMeshMetricsDashboard.json) we provide using [this guide](https://grafana.com/docs/grafana/latest/dashboards/manage-dashboards/#import-a-dashboard). +To access [Grafana](https://github.com/grafana/grafana) and visualize the Prometheus-monitored data, follow the instructions [here](https://github.com/prometheus-operator/kube-prometheus/blob/main/docs/access-ui.md#grafana) and import the [pre-built dashboard](/config/dashboard/ModelMeshMetricsDashboard.json) we provide using [this guide](https://grafana.com/docs/grafana/latest/dashboards/manage-dashboards/#import-a-dashboard). ## Troubleshooting