Skip to content

Commit

Permalink
feat: argo_events_action_retries_failed_total metric
Browse files Browse the repository at this point in the history
Signed-off-by: Taleb Zeghmi <talebz@zillowgroup.com>
  • Loading branch information
talebzeghmi committed Jul 2, 2024
1 parent 58617e6 commit 247349c
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,5 @@ debug.test
site/
/go-diagrams/
argo-events
.swo
.swp
6 changes: 6 additions & 0 deletions docs/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,11 @@ How many actions have been triggered successfully.

How many actions failed.

#### argo_events_action_retries_failed_total

How many actions failed after all retries were exhausted.
This counter is also incremented when an action fails and no `retryStrategy` is specified.

#### argo_events_action_duration_milliseconds

Action triggering duration.
Expand Down Expand Up @@ -140,6 +145,7 @@ of monitoring your applications running with Argo Events.
- `argo_events_events_processing_failed_total`
- `argo_events_events_sent_failed_total`
- `argo_events_action_failed_total`
- `argo_events_action_retries_failed_total`

- Saturation

Expand Down
15 changes: 15 additions & 0 deletions metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ type Metrics struct {
eventProcessingDuration *prometheus.SummaryVec
actionTriggered *prometheus.CounterVec
actionFailed *prometheus.CounterVec
actionRetriesFailed *prometheus.CounterVec
actionDuration *prometheus.SummaryVec
}

Expand Down Expand Up @@ -108,6 +109,14 @@ func NewMetrics(namespace string) *Metrics {
labelNamespace: namespace,
},
}, []string{labelSensorName, labelTriggerName}),
actionRetriesFailed: prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: prefix,
Name: "action_retries_failed_total",
Help: "How many actions failed after the retries have been exhausted. https://argoproj.github.io/argo-events/metrics/#action_retries_failed_total",
ConstLabels: prometheus.Labels{
labelNamespace: namespace,
},
}, []string{labelSensorName, labelTriggerName}),
actionDuration: prometheus.NewSummaryVec(prometheus.SummaryOpts{
Namespace: prefix,
Name: "action_duration_milliseconds",
Expand All @@ -127,6 +136,7 @@ func (m *Metrics) Collect(ch chan<- prometheus.Metric) {
m.eventProcessingDuration.Collect(ch)
m.actionTriggered.Collect(ch)
m.actionFailed.Collect(ch)
m.actionRetriesFailed.Collect(ch)
m.actionDuration.Collect(ch)
}

Expand All @@ -138,6 +148,7 @@ func (m *Metrics) Describe(ch chan<- *prometheus.Desc) {
m.eventProcessingDuration.Describe(ch)
m.actionTriggered.Describe(ch)
m.actionFailed.Describe(ch)
m.actionRetriesFailed.Describe(ch)
m.actionDuration.Describe(ch)
}

Expand Down Expand Up @@ -173,6 +184,10 @@ func (m *Metrics) ActionFailed(sensorName, triggerName string) {
m.actionFailed.WithLabelValues(sensorName, triggerName).Inc()
}

// ActionRetriesFailed increments the action_retries_failed_total counter,
// using sensorName and triggerName as the label values.
func (m *Metrics) ActionRetriesFailed(sensorName, triggerName string) {
	counter := m.actionRetriesFailed.WithLabelValues(sensorName, triggerName)
	counter.Inc()
}

// ActionDuration records num as one observation in the
// action_duration_milliseconds summary, using sensorName and triggerName
// as the label values.
func (m *Metrics) ActionDuration(sensorName, triggerName string, num float64) {
	observer := m.actionDuration.WithLabelValues(sensorName, triggerName)
	observer.Observe(num)
}
Expand Down
1 change: 1 addition & 0 deletions sensors/listener.go
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,7 @@ func (sensorCtx *SensorContext) listenEvents(ctx context.Context) error {
})
if err != nil {
triggerLogger.Warnf("failed to trigger actions, %v", err)
sensorCtx.metrics.ActionRetriesFailed(sensor.Name, trigger.Template.Name)
}
}

Expand Down

0 comments on commit 247349c

Please sign in to comment.