Skip to content
This repository has been archived by the owner on Sep 19, 2022. It is now read-only.

Commit

Permalink
add total suffix in counter metrics
Browse files Browse the repository at this point in the history
Signed-off-by: yeya24 <yb532204897@gmail.com>
  • Loading branch information
yeya24 committed Aug 7, 2019
1 parent ca2c9c4 commit 8045f0a
Show file tree
Hide file tree
Showing 4 changed files with 14 additions and 12 deletions.
14 changes: 8 additions & 6 deletions docs/monitoring/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,32 +56,34 @@ pytorch_operator_is_leader

### Report PyTorch Job metrics:

*Note*: If you are using the v1 release of pytorch-operator, these PyTorch metrics do not have the `_total` suffix, so you have to use metric names like `pytorch_operator_jobs_created` to query them. See the [PR](https://github.com/kubeflow/tf-operator/pull/1055) for more information.

**Job Creation**
```
pytorch_operator_jobs_created
pytorch_operator_jobs_created_total
```

**Job Creation Rate**
```
sum (rate (pytorch_operator_jobs_created[60m]))
sum (rate (pytorch_operator_jobs_created_total[60m]))
```

**Job Deletion**
```
pytorch_operator_jobs_deleted
pytorch_operator_jobs_deleted_total
```

**Successful Job Completions**
```
pytorch_operator_jobs_successful
pytorch_operator_jobs_successful_total
```

**Failed Jobs**
```
pytorch_operator_jobs_failed
pytorch_operator_jobs_failed_total
```

**Restarted Jobs**
```
pytorch_operator_jobs_restarted
pytorch_operator_jobs_restarted_total
```
2 changes: 1 addition & 1 deletion pkg/controller.v1/pytorch/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ var (
}

pytorchJobsDeletedCount = promauto.NewCounter(prometheus.CounterOpts{
Name: "pytorch_operator_jobs_deleted",
Name: "pytorch_operator_jobs_deleted_total",
Help: "Counts number of PyTorch jobs deleted",
})
)
Expand Down
4 changes: 2 additions & 2 deletions pkg/controller.v1/pytorch/job.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ const (

var (
pytorchJobsCreatedCount = promauto.NewCounter(prometheus.CounterOpts{
Name: "pytorch_operator_jobs_created",
Name: "pytorch_operator_jobs_created_total",
Help: "Counts number of PyTorch jobs created",
})
)
Expand All @@ -49,7 +49,7 @@ func (pc *PyTorchController) addPyTorchJob(obj interface{}) {

status := common.JobStatus{
Conditions: []common.JobCondition{
common.JobCondition{
{
Type: common.JobFailed,
Status: v1.ConditionTrue,
LastUpdateTime: metav1.Now(),
Expand Down
6 changes: 3 additions & 3 deletions pkg/controller.v1/pytorch/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,15 +46,15 @@ const (

var (
pytorchJobsSuccessCount = promauto.NewCounter(prometheus.CounterOpts{
Name: "pytorch_operator_jobs_successful",
Name: "pytorch_operator_jobs_successful_total",
Help: "Counts number of PyTorch jobs successful",
})
pytorchJobsFailureCount = promauto.NewCounter(prometheus.CounterOpts{
Name: "pytorch_operator_jobs_failed",
Name: "pytorch_operator_jobs_failed_total",
Help: "Counts number of PyTorch jobs failed",
})
pytorchJobsRestartCount = promauto.NewCounter(prometheus.CounterOpts{
Name: "pytorch_operator_jobs_restarted",
Name: "pytorch_operator_jobs_restarted_total",
Help: "Counts number of PyTorch jobs restarted",
})
)
Expand Down

0 comments on commit 8045f0a

Please sign in to comment.