Skip to content

Commit

Permalink
Merge pull request #2 from ca-scribner/monitoring
Browse files (browse the repository at this point in the history)
Update grafana/prometheus implementation for charmed operator of katib
  • Loading branch information
jardon authored Apr 12, 2022
2 parents 5afb811 + 70aa6fe commit dd7e91f
Show file tree
Hide file tree
Showing 10 changed files with 228 additions and 23 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/test-charmed-katib.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,13 @@ jobs:
set -eux
sudo apt update
sudo apt install python3-setuptools
sudo pip3 install black==20.8b1 flake8
sudo pip3 install black flake8
- name: Check black
run: black --check operators
run: black --check operators/*/src

- name: Check flake8
run: cd operators && flake8
run: cd operators && flake8 ./katib*/src

build:
name: Test
Expand Down
2 changes: 1 addition & 1 deletion operators/bundle.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
bundle: kubernetes
applications:
katib-controller: { charm: katib-controller, scale: 1 }
katib-db: { charm: cs:~charmed-osm/mariadb-k8s, scale: 1, options: { database: katib } }
katib-db: { charm: charmed-osm-mariadb-k8s, scale: 1, options: { database: katib } }
katib-db-manager: { charm: katib-db-manager, scale: 1 }
katib-ui: { charm: katib-ui, scale: 1 }
relations:
Expand Down
4 changes: 2 additions & 2 deletions operators/katib-controller/metadata.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ resources:
provides:
katib-controller:
interface: http
monitoring:
metrics-endpoint:
interface: prometheus_scrape
grafana-dashboard:
interface: grafana_dashboard
interface: grafana_dashboard
11 changes: 3 additions & 8 deletions operators/katib-controller/src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@


class CheckFailed(Exception):
""" Raise this exception if one of the checks in main fails. """
"""Raise this exception if one of the checks in main fails."""

def __init__(self, msg, status_type=None):
super().__init__()
Expand All @@ -40,14 +40,11 @@ def __init__(self, framework):

self.prometheus_provider = MetricsEndpointProvider(
charm=self,
relation_name="monitoring",
jobs=[
{
"job_name": "katib_controller_metrics",
"scrape_interval": "30s",
"metrics_path": "/metrics",
"static_configs": [
{"targets": ["*:{}".format(self.config["metrics-port"])]}
{"targets": [f"*:{self.config['metrics-port']}"]}
],
}
],
Expand All @@ -57,10 +54,8 @@ def __init__(self, framework):
for event in [
self.on.config_changed,
self.on.install,
self.on.leader_elected,
self.on.upgrade_charm,
self.on["monitoring"].relation_changed,
self.on["monitoring"].relation_broken,
self.on["monitoring"].relation_departed,
]:
self.framework.observe(event, self.set_pod_spec)

Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"id": 4,
"links": [],
"panels": [
{
"datasource": "${prometheusds}",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"custom": {},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 12,
"x": 0,
"y": 0
},
"id": 2,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true,
"text": {}
},
"pluginVersion": "7.4.1",
"targets": [
{
"expr": "katib_experiments_current{status=\"Running\"}",
"interval": "",
"legendFormat": "",
"queryType": "randomWalk",
"refId": "A"
},
{
"expr": "katib_trials_current{status=\"Running\"}",
"hide": false,
"interval": "",
"legendFormat": "",
"refId": "B"
}
],
"timeFrom": null,
"timeShift": null,
"title": "Current Status",
"type": "gauge"
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${prometheusds}",
"fieldConfig": {
"defaults": {
"custom": {},
"unit": "/hr"
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 9
},
"hiddenSeries": false,
"id": 4,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.4.1",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "rate(katib_experiment_created_total[60m]) * 60*60",
"interval": "",
"legendFormat": "Experiments",
"queryType": "randomWalk",
"refId": "A"
},
{
"expr": "rate(katib_trial_created_total[60m]) * 60*60",
"hide": false,
"interval": "",
"legendFormat": "Trials",
"refId": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Creation Rates",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "/hr",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"schemaVersion": 27,
"style": "dark",
"tags": [],
"templating": {
"list": []
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Katib Status",
"uid": "SgVtIT87z",
"version": 1
}

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Prometheus alerting rule: raise a critical alert when a scraped
# katib-controller target is reported as down.
alert: KatibControllerUnitIsUnavailable
# `up` is the per-target health series Prometheus records for every scrape:
# 1 when the scrape succeeded, 0 when it failed.
expr: up < 1
# A zero hold duration: the alert fires on the first evaluation where the
# expression is true instead of waiting for the condition to persist.
for: 0m
labels:
  severity: critical
annotations:
  # NOTE(review): the juju_model / juju_unit labels are presumably attached to
  # the scraped series by the Juju charm tooling; that is not shown in this
  # file -- confirm against the relation that consumes this rule.
  summary: Katib-controller unit {{ $labels.juju_model }}/{{ $labels.juju_unit }} unavailable
  # Folded scalar (`>`): the two lines below are joined into one paragraph.
  description: >
    The Katib-controller unit {{ $labels.juju_model }} {{ $labels.juju_unit }} is unavailable
    LABELS = {{ $labels }}
2 changes: 1 addition & 1 deletion operators/katib-db-manager/src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@


class CheckFailed(Exception):
""" Raise this exception if one of the checks in main fails. """
"""Raise this exception if one of the checks in main fails."""

def __init__(self, msg, status_type=None):
super().__init__()
Expand Down
2 changes: 1 addition & 1 deletion operators/katib-ui/src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@


class CheckFailed(Exception):
""" Raise this exception if one of the checks in main fails. """
"""Raise this exception if one of the checks in main fails."""

def __init__(self, msg, status_type=None):
super().__init__()
Expand Down

0 comments on commit dd7e91f

Please sign in to comment.