From cfdca0894b95f3ce5f69e611301f8745b9d205e9 Mon Sep 17 00:00:00 2001 From: Taleb Zeghmi <4167032+talebzeghmi@users.noreply.github.com> Date: Tue, 9 Jul 2024 23:20:42 -0700 Subject: [PATCH] feat: argo_events_action_retries_failed_total metric (#3190) Signed-off-by: Derek Wang --- .gitignore | 2 ++ docs/metrics.md | 6 ++++++ metrics/metrics.go | 15 +++++++++++++++ sensors/listener.go | 1 + 4 files changed, 24 insertions(+) diff --git a/.gitignore b/.gitignore index 0d7f404ba3..18545952b2 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,5 @@ debug.test site/ /go-diagrams/ argo-events +.swo +.swp diff --git a/docs/metrics.md b/docs/metrics.md index 1c9bb91db1..4900272894 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -84,6 +84,11 @@ How many actions have been triggered successfully. How many actions failed. +#### argo_events_action_retries_failed_total + +How many actions failed after the retries have been exhausted. +This is also incremented if there is no `retryStrategy` specified. + #### argo_events_action_duration_milliseconds Action triggering duration. @@ -140,6 +145,7 @@ of monitoring your applications running with Argo Events. 
- `argo_events_events_processing_failed_total` - `argo_events_events_sent_failed_total` - `argo_events_action_failed_total` + - `argo_events_action_retries_failed_total` - Saturation diff --git a/metrics/metrics.go b/metrics/metrics.go index 84dc74c86c..4f114fe920 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -45,6 +45,7 @@ type Metrics struct { eventProcessingDuration *prometheus.SummaryVec actionTriggered *prometheus.CounterVec actionFailed *prometheus.CounterVec + actionRetriesFailed *prometheus.CounterVec actionDuration *prometheus.SummaryVec } @@ -108,6 +109,14 @@ func NewMetrics(namespace string) *Metrics { labelNamespace: namespace, }, }, []string{labelSensorName, labelTriggerName}), + actionRetriesFailed: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: prefix, + Name: "action_retries_failed_total", + Help: "How many actions failed after the retries have been exhausted. https://argoproj.github.io/argo-events/metrics/#argo_events_action_retries_failed_total", + ConstLabels: prometheus.Labels{ + labelNamespace: namespace, + }, + }, []string{labelSensorName, labelTriggerName}), actionDuration: prometheus.NewSummaryVec(prometheus.SummaryOpts{ Namespace: prefix, Name: "action_duration_milliseconds", @@ -127,6 +136,7 @@ func (m *Metrics) Collect(ch chan<- prometheus.Metric) { m.eventProcessingDuration.Collect(ch) m.actionTriggered.Collect(ch) m.actionFailed.Collect(ch) + m.actionRetriesFailed.Collect(ch) m.actionDuration.Collect(ch) } @@ -138,6 +148,7 @@ func (m *Metrics) Describe(ch chan<- *prometheus.Desc) { m.eventProcessingDuration.Describe(ch) m.actionTriggered.Describe(ch) m.actionFailed.Describe(ch) + m.actionRetriesFailed.Describe(ch) m.actionDuration.Describe(ch) } @@ -173,6 +184,10 @@ func (m *Metrics) ActionFailed(sensorName, triggerName string) { m.actionFailed.WithLabelValues(sensorName, triggerName).Inc() } +func (m *Metrics) ActionRetriesFailed(sensorName, triggerName string) { + m.actionRetriesFailed.WithLabelValues(sensorName, 
triggerName).Inc() +} + func (m *Metrics) ActionDuration(sensorName, triggerName string, num float64) { m.actionDuration.WithLabelValues(sensorName, triggerName).Observe(num) } diff --git a/sensors/listener.go b/sensors/listener.go index a8f9f96582..81dc35c225 100644 --- a/sensors/listener.go +++ b/sensors/listener.go @@ -220,6 +220,7 @@ func (sensorCtx *SensorContext) listenEvents(ctx context.Context) error { }) if err != nil { triggerLogger.Warnf("failed to trigger actions, %v", err) + sensorCtx.metrics.ActionRetriesFailed(sensor.Name, trigger.Template.Name) } }