Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: argo_events_action_retries_failed_total metric #3190

Merged
merged 1 commit into from
Jul 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,5 @@ debug.test
site/
/go-diagrams/
argo-events
.swo
.swp
6 changes: 6 additions & 0 deletions docs/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,11 @@ How many actions have been triggered successfully.

How many actions failed.

#### argo_events_action_retries_failed_total

How many actions failed after all retries were exhausted.
This counter is also incremented when an action fails and no `retryStrategy` is specified.

#### argo_events_action_duration_milliseconds

Action triggering duration.
Expand Down Expand Up @@ -140,6 +145,7 @@ of monitoring your applications running with Argo Events.
- `argo_events_events_processing_failed_total`
- `argo_events_events_sent_failed_total`
- `argo_events_action_failed_total`
- `argo_events_action_retries_failed_total`

- Saturation

Expand Down
15 changes: 15 additions & 0 deletions metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ type Metrics struct {
eventProcessingDuration *prometheus.SummaryVec
actionTriggered *prometheus.CounterVec
actionFailed *prometheus.CounterVec
actionRetriesFailed *prometheus.CounterVec
actionDuration *prometheus.SummaryVec
}

Expand Down Expand Up @@ -108,6 +109,14 @@ func NewMetrics(namespace string) *Metrics {
labelNamespace: namespace,
},
}, []string{labelSensorName, labelTriggerName}),
actionRetriesFailed: prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: prefix,
Name: "action_retries_failed_total",
Help: "How many actions failed after the retries have been exhausted. https://argoproj.github.io/argo-events/metrics/#action_retries_failed_total",
ConstLabels: prometheus.Labels{
labelNamespace: namespace,
},
}, []string{labelSensorName, labelTriggerName}),
actionDuration: prometheus.NewSummaryVec(prometheus.SummaryOpts{
Namespace: prefix,
Name: "action_duration_milliseconds",
Expand All @@ -127,6 +136,7 @@ func (m *Metrics) Collect(ch chan<- prometheus.Metric) {
m.eventProcessingDuration.Collect(ch)
m.actionTriggered.Collect(ch)
m.actionFailed.Collect(ch)
m.actionRetriesFailed.Collect(ch)
m.actionDuration.Collect(ch)
}

Expand All @@ -138,6 +148,7 @@ func (m *Metrics) Describe(ch chan<- *prometheus.Desc) {
m.eventProcessingDuration.Describe(ch)
m.actionTriggered.Describe(ch)
m.actionFailed.Describe(ch)
m.actionRetriesFailed.Describe(ch)
m.actionDuration.Describe(ch)
}

Expand Down Expand Up @@ -173,6 +184,10 @@ func (m *Metrics) ActionFailed(sensorName, triggerName string) {
m.actionFailed.WithLabelValues(sensorName, triggerName).Inc()
}

// ActionRetriesFailed adds one to the actionRetriesFailed counter series
// identified by the given sensor and trigger names.
func (m *Metrics) ActionRetriesFailed(sensorName, triggerName string) {
	counter := m.actionRetriesFailed.WithLabelValues(sensorName, triggerName)
	counter.Inc()
}

// ActionDuration records num as a single observation on the actionDuration
// summary series identified by the given sensor and trigger names.
// NOTE(review): num is presumably in milliseconds, per the metric name
// "action_duration_milliseconds" — confirm against callers.
func (m *Metrics) ActionDuration(sensorName, triggerName string, num float64) {
	observer := m.actionDuration.WithLabelValues(sensorName, triggerName)
	observer.Observe(num)
}
Expand Down
1 change: 1 addition & 0 deletions sensors/listener.go
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,7 @@ func (sensorCtx *SensorContext) listenEvents(ctx context.Context) error {
})
if err != nil {
triggerLogger.Warnf("failed to trigger actions, %v", err)
sensorCtx.metrics.ActionRetriesFailed(sensor.Name, trigger.Template.Name)
}
}

Expand Down
Loading