From a02b0a85c04242051a95c75fb66bd6086d28eeb7 Mon Sep 17 00:00:00 2001 From: Kyle Huntsman <3432646+kylehuntsman@users.noreply.github.com> Date: Thu, 22 Sep 2022 00:49:52 -0700 Subject: [PATCH] feat(monitoring): add http response code metrics to dashboard --- cmd/booster-http/server.go | 14 +- .../dashboards/exported_dashboard.json | 390 +++++++++++++++--- metrics/metrics.go | 56 ++- 3 files changed, 386 insertions(+), 74 deletions(-) diff --git a/cmd/booster-http/server.go b/cmd/booster-http/server.go index 73bfc9e50..915d5f691 100644 --- a/cmd/booster-http/server.go +++ b/cmd/booster-http/server.go @@ -162,6 +162,7 @@ func (s *HttpServer) handleByPayloadCid(w http.ResponseWriter, r *http.Request) if len(r.URL.Path) <= prefixLen { msg := fmt.Sprintf("path '%s' is missing payload CID", r.URL.Path) writeError(w, r, http.StatusBadRequest, msg) + stats.Record(ctx, metrics.HttpPayloadByCid400ResponseCount.M(1)) return } @@ -172,6 +173,7 @@ func (s *HttpServer) handleByPayloadCid(w http.ResponseWriter, r *http.Request) if err != nil { msg := fmt.Sprintf("parsing payload CID '%s': %s", payloadCidStr, err.Error()) writeError(w, r, http.StatusBadRequest, msg) + stats.Record(ctx, metrics.HttpPayloadByCid400ResponseCount.M(1)) return } @@ -181,11 +183,13 @@ func (s *HttpServer) handleByPayloadCid(w http.ResponseWriter, r *http.Request) if isNotFoundError(err) { msg := fmt.Sprintf("getting piece that contains payload CID '%s': %s", payloadCid, err.Error()) writeError(w, r, http.StatusNotFound, msg) + stats.Record(ctx, metrics.HttpPayloadByCid404ResponseCount.M(1)) return } log.Errorf("getting piece that contains payload CID '%s': %s", payloadCid, err) msg := fmt.Sprintf("server error getting piece that contains payload CID '%s'", payloadCidStr) writeError(w, r, http.StatusInternalServerError, msg) + stats.Record(ctx, metrics.HttpPayloadByCid500ResponseCount.M(1)) return } @@ -200,11 +204,13 @@ func (s *HttpServer) handleByPayloadCid(w http.ResponseWriter, r *http.Request) if isNotFoundError(err) { msg := fmt.Sprintf("getting content for payload CID %s in piece %s: %s", payloadCidStr, pieceCid, err) writeError(w, r, http.StatusNotFound, msg) + stats.Record(ctx, metrics.HttpPayloadByCid404ResponseCount.M(1)) return } log.Errorf("getting content for payload CID %s in piece %s: %s", payloadCid, pieceCid, err) msg := fmt.Sprintf("server error getting content for payload CID %s in piece %s", payloadCidStr, pieceCid) writeError(w, r, http.StatusInternalServerError, msg) + stats.Record(ctx, metrics.HttpPayloadByCid500ResponseCount.M(1)) return } @@ -217,7 +223,7 @@ func (s *HttpServer) handleByPayloadCid(w http.ResponseWriter, r *http.Request) serveContent(w, r, content, getContentType(isCar)) - // Record retrieval duration + stats.Record(ctx, metrics.HttpPayloadByCid200ResponseCount.M(1)) stats.Record(ctx, metrics.HttpPayloadByCidRequestDuration.M(float64(time.Since(startTime).Milliseconds()))) } @@ -232,6 +238,7 @@ func (s *HttpServer) handleByPieceCid(w http.ResponseWriter, r *http.Request) { if len(r.URL.Path) <= prefixLen { msg := fmt.Sprintf("path '%s' is missing piece CID", r.URL.Path) writeError(w, r, http.StatusBadRequest, msg) + stats.Record(ctx, metrics.HttpPieceByCid400ResponseCount.M(1)) return } @@ -242,6 +249,7 @@ func (s *HttpServer) handleByPieceCid(w http.ResponseWriter, r *http.Request) { if err != nil { msg := fmt.Sprintf("parsing piece CID '%s': %s", pieceCidStr, err.Error()) writeError(w, r, http.StatusBadRequest, msg) + stats.Record(ctx, metrics.HttpPieceByCid400ResponseCount.M(1)) return } @@ -253,11 +261,13 @@ func (s *HttpServer) handleByPieceCid(w http.ResponseWriter, r *http.Request) { if err != nil { if isNotFoundError(err) { writeError(w, r, http.StatusNotFound, err.Error()) + stats.Record(ctx, metrics.HttpPieceByCid404ResponseCount.M(1)) return } log.Errorf("getting content for piece %s: %s", pieceCid, err) msg := fmt.Sprintf("server error getting content for piece CID %s", pieceCidStr) writeError(w, r, http.StatusInternalServerError, msg) + stats.Record(ctx, metrics.HttpPieceByCid500ResponseCount.M(1)) return } @@ -270,7 +280,7 @@ func (s *HttpServer) handleByPieceCid(w http.ResponseWriter, r *http.Request) { serveContent(w, r, content, getContentType(isCar)) - // Record retrieval duration + stats.Record(ctx, metrics.HttpPieceByCid200ResponseCount.M(1)) stats.Record(ctx, metrics.HttpPieceByCidRequestDuration.M(float64(time.Since(startTime).Milliseconds()))) } diff --git a/docker/monitoring/grafana/dashboards/exported_dashboard.json b/docker/monitoring/grafana/dashboards/exported_dashboard.json index 44eb220b9..8ff390b92 100644 --- a/docker/monitoring/grafana/dashboards/exported_dashboard.json +++ b/docker/monitoring/grafana/dashboards/exported_dashboard.json @@ -198,8 +198,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -387,8 +386,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -638,8 +636,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -734,8 +731,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -831,8 +827,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -928,8 +923,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1024,8 +1018,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1120,8 +1113,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1216,8 +1208,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1445,6 +1436,7 @@ "mode": "off" } }, + "displayName": "Requests/s", "mappings": [], "thresholds": { "mode": "absolute", @@ -1452,12 +1444,41 @@ { "color": "green", "value": null + }, + { + "color": "red", + "value": 80 } ] }, - "unit": "ms" + "unit": "none" }, - "overrides": [] + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "Requests/s" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] }, "gridPos": { "h": 7, @@ -1465,11 +1486,13 @@ "x": 12, "y": 11 }, - "id": 5, + "id": 3, "options": { "legend": { - "calcs": [], - "displayMode": "list", + "calcs": [ + "last" + ], + "displayMode": "table", "placement": "bottom", "showLegend": true }, @@ -1478,6 +1501,7 @@ "sort": "none" } }, + "pluginVersion": "9.2.0-77684pre", "targets": [ { "datasource": { @@ -1485,14 +1509,13 @@ "uid": "PBFA97CFB590B2093" }, "editorMode": "code", - "expr": "histogram_quantile(0.95, sum(rate(booster_http_http_payload_by_cid_request_duration_ms_bucket{}[$__rate_interval])) by (le))", - "format": "time_series", + "expr": "sum(rate(booster_http_http_piece_by_cid_request_count{}[$__rate_interval]))", "legendFormat": "__auto", "range": true, "refId": "A" } ], - "title": "Payload by CID Request Durations", + "title": "Piece by CID Requests", "type": "timeseries" }, { @@ -1535,7 +1558,6 @@ "mode": "off" } }, - "displayName": "Requests/s", "mappings": [], "thresholds": { "mode": "absolute", @@ -1552,55 +1574,285 @@ }, "unit": "none" }, - "overrides": [ - { - "__systemRef": "hideSeriesFrom", - "matcher": { - "id": "byNames", - "options": { - "mode": "exclude", - "names": [ - "Requests/s" - ], - "prefix": "All except:", - "readOnly": true - } + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 18 + }, + "id": 36, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "sum(rate(booster_http_http_payload_by_cid_200_response_count{}[$__rate_interval]))", + "legendFormat": "200 | Ok", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "sum(rate(booster_http_http_payload_by_cid_400_response_count{}[$__rate_interval]))", + "hide": false, + "legendFormat": "400 | Bad Request", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "sum(rate(booster_http_http_payload_by_cid_404_response_count{}[$__rate_interval]))", + "hide": false, + "legendFormat": "404 | Not Found", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "sum(rate(booster_http_http_payload_by_cid_500_response_count{}[$__rate_interval]))", + "hide": false, + "legendFormat": "500 | Internal Server Error", + "range": true, + "refId": "D" + } + ], + "title": "Payload by CID Status Response Counts", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false }, - "properties": [ + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ { - "id": "custom.hideFrom", - "value": { - "legend": false, - "tooltip": false, - "viz": true - } + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 } ] - } - ] + }, + "unit": "none" + }, + "overrides": [] }, "gridPos": { - "h": 7, + "h": 8, "w": 12, - "x": 0, + "x": 12, "y": 18 }, - "id": 3, + "id": 37, "options": { "legend": { "calcs": [ - "last" + "lastNotNull" ], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "sum(rate(booster_http_http_piece_by_cid_200_response_count{}[$__rate_interval]))", + "legendFormat": "200 | Ok", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "sum(rate(booster_http_http_piece_by_cid_400_response_count{}[$__rate_interval]))", + "hide": false, + "legendFormat": "400 | Bad Request", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "sum(rate(booster_http_http_piece_by_cid_404_response_count{}[$__rate_interval]))", + "hide": false, + "legendFormat": "404 | Not Found", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "sum(rate(booster_http_http_piece_by_cid_500_response_count{}[$__rate_interval]))", + "hide": false, + "legendFormat": "500 | Internal Server Error", + "range": true, + "refId": "D" + } + ], + "title": "Piece by CID Status Response Counts", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 26 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, "tooltip": { "mode": "single", "sort": "none" } }, - "pluginVersion": "9.2.0-77684pre", "targets": [ { "datasource": { @@ -1608,13 +1860,14 @@ "uid": "PBFA97CFB590B2093" }, "editorMode": "code", - "expr": "sum(rate(booster_http_http_piece_by_cid_request_count{}[$__rate_interval]))", + "expr": "histogram_quantile(0.95, sum(rate(booster_http_http_payload_by_cid_request_duration_ms_bucket{}[$__rate_interval])) by (le))", + "format": "time_series", "legendFormat": "__auto", "range": true, "refId": "A" } ], - "title": "Piece by CID Requests", + "title": "Payload by CID Request Durations", "type": "timeseries" }, { @@ -1675,7 +1928,7 @@ "h": 7, "w": 12, "x": 12, - "y": 18 + "y": 26 }, "id": 6, "options": { @@ -1713,7 +1966,7 @@ "h": 1, "w": 24, "x": 0, - "y": 25 + "y": 33 }, "id": 28, "panels": [ @@ -1779,7 +2032,7 @@ "h": 8, "w": 12, "x": 0, - "y": 20 + "y": 18 }, "id": 26, "links": [], @@ -1967,7 +2220,7 @@ "h": 8, "w": 12, "x": 12, - "y": 20 + "y": 18 }, "id": 12, "links": [], @@ -2069,7 +2322,7 @@ "h": 8, "w": 12, "x": 0, - "y": 28 + "y": 26 }, "hiddenSeries": false, "id": 24, @@ -2217,7 +2470,7 @@ "h": 8, "w": 12, "x": 12, - "y": 28 + "y": 26 }, "id": 16, "links": [], @@ -2312,7 +2565,7 @@ "h": 8, "w": 12, "x": 0, - "y": 36 + "y": 34 }, "id": 22, "links": [], @@ -2408,7 +2661,7 @@ "h": 8, "w": 12, "x": 12, - "y": 36 + "y": 34 }, "id": 20, "links": [], @@ -2504,7 +2757,7 @@ "h": 8, "w": 12, "x": 0, - "y": 44 + "y": 42 }, "id": 18, "links": [], @@ -2599,7 +2852,7 @@ "h": 8, "w": 12, "x": 12, - "y": 44 + "y": 42 }, "id": 8, "links": [], @@ -2694,7 +2947,7 @@ "h": 8, "w": 12, "x": 0, - "y": 52 + "y": 50 }, "id": 14, "links": [], @@ -2789,7 +3042,7 @@ "h": 8, "w": 12, "x": 12, - "y": 52 + "y": 50 }, "id": 4, "links": [], @@ -2827,6 +3080,7 @@ "type": "row" } ], + "refresh": false, "schemaVersion": 37, "style": "dark", "tags": [], diff --git a/metrics/metrics.go b/metrics/metrics.go index 3741e752c..2b8a05e0f 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -115,10 +115,18 @@ var ( SplitstoreCompactionDead = stats.Int64("splitstore/dead", "Number of dead blocks in last compaction", stats.UnitDimensionless) // http - HttpPayloadByCidRequestCount = stats.Int64("http/payload_by_cid_request_count", "Counter of /payload/ requests", stats.UnitDimensionless) - HttpPayloadByCidRequestDuration = stats.Float64("http/payload_by_cid_request_duration_ms", "Time spent retrieving a payload by cid", stats.UnitMilliseconds) - HttpPieceByCidRequestCount = stats.Int64("http/piece_by_cid_request_count", "Counter of /piece/ requests", stats.UnitDimensionless) - HttpPieceByCidRequestDuration = stats.Float64("http/piece_by_cid_request_duration_ms", "Time spent retrieving a piece by cid", stats.UnitMilliseconds) + HttpPayloadByCidRequestCount = stats.Int64("http/payload_by_cid_request_count", "Counter of /payload/ requests", stats.UnitDimensionless) + HttpPayloadByCidRequestDuration = stats.Float64("http/payload_by_cid_request_duration_ms", "Time spent retrieving a payload by cid", stats.UnitMilliseconds) + HttpPayloadByCid200ResponseCount = stats.Int64("http/payload_by_cid_200_response_count", "Counter of /payload/ 200 responses", stats.UnitDimensionless) + HttpPayloadByCid400ResponseCount = stats.Int64("http/payload_by_cid_400_response_count", "Counter of /payload/ 400 responses", stats.UnitDimensionless) + HttpPayloadByCid404ResponseCount = stats.Int64("http/payload_by_cid_404_response_count", "Counter of /payload/ 404 responses", stats.UnitDimensionless) + HttpPayloadByCid500ResponseCount = stats.Int64("http/payload_by_cid_500_response_count", "Counter of /payload/ 500 responses", stats.UnitDimensionless) + HttpPieceByCidRequestCount = stats.Int64("http/piece_by_cid_request_count", "Counter of /piece/ requests", stats.UnitDimensionless) + HttpPieceByCidRequestDuration = stats.Float64("http/piece_by_cid_request_duration_ms", "Time spent retrieving a piece by cid", stats.UnitMilliseconds) + HttpPieceByCid200ResponseCount = stats.Int64("http/piece_by_cid_200_response_count", "Counter of /piece/ 200 responses", stats.UnitDimensionless) + HttpPieceByCid400ResponseCount = stats.Int64("http/piece_by_cid_400_response_count", "Counter of /piece/ 400 responses", stats.UnitDimensionless) + HttpPieceByCid404ResponseCount = stats.Int64("http/piece_by_cid_404_response_count", "Counter of /piece/ 404 responses", stats.UnitDimensionless) + HttpPieceByCid500ResponseCount = stats.Int64("http/piece_by_cid_500_response_count", "Counter of /piece/ 500 responses", stats.UnitDimensionless) ) var ( @@ -131,6 +139,22 @@ var ( Measure: HttpPayloadByCidRequestDuration, Aggregation: defaultMillisecondsDistribution, } + HttpPayloadByCid200ResponseCountView = &view.View{ + Measure: HttpPayloadByCid200ResponseCount, + Aggregation: view.Count(), + } + HttpPayloadByCid400ResponseCountView = &view.View{ + Measure: HttpPayloadByCid400ResponseCount, + Aggregation: view.Count(), + } + HttpPayloadByCid404ResponseCountView = &view.View{ + Measure: HttpPayloadByCid404ResponseCount, + Aggregation: view.Count(), + } + HttpPayloadByCid500ResponseCountView = &view.View{ + Measure: HttpPayloadByCid500ResponseCount, + Aggregation: view.Count(), + } HttpPieceByCidRequestCountView = &view.View{ Measure: HttpPieceByCidRequestCount, Aggregation: view.Count(), @@ -139,6 +163,22 @@ var ( Measure: HttpPieceByCidRequestDuration, Aggregation: defaultMillisecondsDistribution, } + HttpPieceByCid200ResponseCountView = &view.View{ + Measure: HttpPieceByCid200ResponseCount, + Aggregation: view.Count(), + } + HttpPieceByCid400ResponseCountView = &view.View{ + Measure: HttpPieceByCid400ResponseCount, + Aggregation: view.Count(), + } + HttpPieceByCid404ResponseCountView = &view.View{ + Measure: HttpPieceByCid404ResponseCount, + Aggregation: view.Count(), + } + HttpPieceByCid500ResponseCountView = &view.View{ + Measure: HttpPieceByCid500ResponseCount, + Aggregation: view.Count(), + } InfoView = &view.View{ Name: "info", @@ -412,8 +452,16 @@ var DefaultViews = func() []*view.View { APIRequestDurationView, HttpPayloadByCidRequestCountView, HttpPayloadByCidRequestDurationView, + HttpPayloadByCid200ResponseCountView, + HttpPayloadByCid400ResponseCountView, + HttpPayloadByCid404ResponseCountView, + HttpPayloadByCid500ResponseCountView, HttpPieceByCidRequestCountView, HttpPieceByCidRequestDurationView, + HttpPieceByCid200ResponseCountView, + HttpPieceByCid400ResponseCountView, + HttpPieceByCid404ResponseCountView, + HttpPieceByCid500ResponseCountView, } //views = append(views, blockstore.DefaultViews...) views = append(views, rpcmetrics.DefaultViews...)