Set ActiveStatus when retention size config option is valid (#561)
* Fix status for retention config option
* fetch-lib
* Refactor status setting

Co-authored-by: Ben Hoyt <benhoyt@gmail.com>
sed-i and benhoyt committed Jan 29, 2024
1 parent 4573def commit 4ca83b6
Showing 4 changed files with 144 additions and 36 deletions.
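
This commit switches the charm to the "pull plus push" status pattern described in the Discourse post referenced in the diff: statuses that can be recomputed from current state are added in a collect-status handler on every event, while statuses that are only known when a specific event fires are persisted in stored state and replayed from the same handler. Before the diff, a minimal sketch of that pattern, using a hypothetical MyCharm and config option "some_option" (not taken from this repository) and assuming the ops collect-status API that appears below:

# Minimal sketch of the pull/push status pattern; MyCharm and "some_option"
# are hypothetical and not part of this repository.
from ops import CollectStatusEvent, StoredState
from ops.charm import CharmBase
from ops.main import main
from ops.model import BlockedStatus, StatusBase


class MyCharm(CharmBase):
    _stored = StoredState()

    def __init__(self, *args):
        super().__init__(*args)
        # "Push" statuses must survive across hooks, so they live in stored
        # state as (name, message) tuples, which StoredState can marshal.
        self._stored.set_default(status={"some_component": ("active", "")})
        self.framework.observe(self.on.collect_unit_status, self._on_collect_unit_status)
        self.framework.observe(self.on.config_changed, self._on_config_changed)

    def _on_config_changed(self, _):
        # A "push" status: recorded at the moment the event is handled.
        if self.model.config.get("some_option") == "bad":
            self._stored.status["some_component"] = ("blocked", "some_option is bad")
        else:
            self._stored.status["some_component"] = ("active", "")

    def _on_collect_unit_status(self, event: CollectStatusEvent):
        # A "pull" status: recomputed from live model data on every event.
        if not self.model.config.get("some_option"):
            event.add_status(BlockedStatus("some_option is not set"))
        # Replay the "push" statuses; ops surfaces the highest-priority one.
        for name, message in self._stored.status.values():
            event.add_status(StatusBase.from_name(name, message))


if __name__ == "__main__":
    main(MyCharm)

The diff below applies the same idea with three components (retention_size, k8s_patch, config) held in a TypedDict so the keys stay explicit.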
122 changes: 91 additions & 31 deletions src/charm.py
@@ -10,7 +10,7 @@
import socket
import subprocess
from pathlib import Path
from typing import Dict, Optional, cast
from typing import Dict, Optional, Tuple, TypedDict, cast
from urllib.parse import urlparse

import yaml
@@ -46,6 +46,7 @@
from lightkube.core.client import Client
from lightkube.core.exceptions import ApiError as LightkubeApiError
from lightkube.resources.core_v1 import PersistentVolumeClaim, Pod
from ops import CollectStatusEvent, StoredState
from ops.charm import ActionEvent, CharmBase
from ops.main import main
from ops.model import (
@@ -54,6 +55,7 @@
MaintenanceStatus,
ModelError,
OpenedPort,
StatusBase,
WaitingStatus,
)
from ops.pebble import Error as PebbleError
@@ -91,6 +93,27 @@ class ConfigError(Exception):
pass


class CompositeStatus(TypedDict):
"""Per-component status holder."""

# These are going to go into stored state, so we must use marshallable objects.
# They are passed to StatusBase.from_name().
retention_size: Tuple[str, str]
k8s_patch: Tuple[str, str]
config: Tuple[str, str]


def to_tuple(status: StatusBase) -> Tuple[str, str]:
"""Convert a StatusBase to tuple, so it is marshallable into StoredState."""
return status.name, status.message


def to_status(tpl: Tuple[str, str]) -> StatusBase:
"""Convert a tuple to a StatusBase, so it could be used natively with ops."""
name, message = tpl
return StatusBase.from_name(name, message)


@trace_charm(
tracing_endpoint="tempo",
extra_types=[
@@ -104,9 +127,21 @@ class ConfigError(Exception):
class PrometheusCharm(CharmBase):
"""A Juju Charm for Prometheus."""

_stored = StoredState()

def __init__(self, *args):
super().__init__(*args)

# Prometheus has a mix of pull and push statuses. We need stored state for push statuses.
# https://discourse.charmhub.io/t/its-probably-ok-for-a-unit-to-go-into-error-state/13022
self._stored.set_default(
status=CompositeStatus(
retention_size=to_tuple(ActiveStatus()),
k8s_patch=to_tuple(ActiveStatus()),
config=to_tuple(ActiveStatus()),
)
)

self._name = "prometheus"
self._port = 9090
self.container = self.unit.get_container(self._name)
@@ -191,6 +226,17 @@ def __init__(self, *args):
self.framework.observe(self.alertmanager_consumer.on.cluster_changed, self._configure)
self.framework.observe(self.resources_patch.on.patch_failed, self._on_k8s_patch_failed)
self.framework.observe(self.on.validate_configuration_action, self._on_validate_config)
self.framework.observe(self.on.collect_unit_status, self._on_collect_unit_status)

def _on_collect_unit_status(self, event: CollectStatusEvent):
# "Pull" statuses
retention_time = self.model.config.get("metrics_retention_time", "")
if not self._is_valid_timespec(retention_time):
event.add_status(BlockedStatus(f"Invalid time spec : {retention_time}"))

# "Push" statuses
for status in self._stored.status.values():
event.add_status(to_status(status))

def set_ports(self):
"""Open necessary (and close no longer needed) workload ports."""
@@ -379,7 +425,7 @@ def _on_ingress_revoked(self, event: IngressPerUnitRevokedForUnitEvent):
self._configure(event)

def _on_k8s_patch_failed(self, event: K8sResourcePatchFailedEvent):
self.unit.status = BlockedStatus(cast(str, event.message))
self._stored.status["k8s_patch"] = to_tuple(BlockedStatus(cast(str, event.message)))

def _on_server_cert_changed(self, _):
self._update_cert()
@@ -476,13 +522,21 @@ def _configure(self, _):
),
}

if not self.resources_patch.is_ready():
if isinstance(self.unit.status, ActiveStatus) or self.unit.status.message == "":
self.unit.status = WaitingStatus("Waiting for resource limit patch to apply")
# "is_ready" is a racy check, so we do it once here (instead of in collect-status)
if self.resources_patch.is_ready():
self._stored.status["k8s_patch"] = to_tuple(ActiveStatus())
else:
if isinstance(to_status(self._stored.status["k8s_patch"]), ActiveStatus):
self._stored.status["k8s_patch"] = to_tuple(
WaitingStatus("Waiting for resource limit patch to apply")
)
return

if not self.container.can_connect():
self.unit.status = MaintenanceStatus("Configuring Prometheus")
# "can_connect" is a racy check, so we do it once here (instead of in collect-status)
if self.container.can_connect():
self._stored.status["config"] = to_tuple(ActiveStatus())
else:
self._stored.status["config"] = to_tuple(MaintenanceStatus("Configuring Prometheus"))
return

if self._is_cert_available() and not self._is_tls_ready():
@@ -508,32 +562,38 @@ def _configure(self, _):
)
except ConfigError as e:
logger.error("Failed to generate configuration: %s", e)
self.unit.status = BlockedStatus(str(e))
self._stored.status["config"] = to_tuple(BlockedStatus(str(e)))
return
except PebbleError as e:
logger.error("Failed to push updated config/alert files: %s", e)
self.unit.status = early_return_statuses["push_fail"]
self._stored.status["config"] = to_tuple(early_return_statuses["push_fail"])
return
else:
self._stored.status["config"] = to_tuple(ActiveStatus())

try:
layer_changed = self._update_layer()
except (TypeError, PebbleError) as e:
logger.error("Failed to update prometheus service: %s", e)
self.unit.status = early_return_statuses["layer_fail"]
self._stored.status["config"] = to_tuple(early_return_statuses["layer_fail"])
return
else:
self._stored.status["config"] = to_tuple(ActiveStatus())

try:
output, err = self._promtool_check_config()
if err:
logger.error(
"Invalid prometheus configuration. Stdout: %s Stderr: %s", output, err
)
self.unit.status = early_return_statuses["config_invalid"]
self._stored.status["config"] = to_tuple(early_return_statuses["config_invalid"])
return
except PebbleError as e:
logger.error("Failed to validate prometheus config: %s", e)
self.unit.status = early_return_statuses["validation_fail"]
self._stored.status["config"] = to_tuple(early_return_statuses["validation_fail"])
return
else:
self._stored.status["config"] = to_tuple(ActiveStatus())

try:
# If a config is invalid then prometheus would exit immediately.
Expand All @@ -547,30 +607,25 @@ def _configure(self, _):
self._prometheus_layer.to_dict(),
e,
)
self.unit.status = early_return_statuses["restart_fail"]
self._stored.status["config"] = to_tuple(early_return_statuses["restart_fail"])
return
else:
self._stored.status["config"] = to_tuple(ActiveStatus())

# We only need to reload if pebble didn't replan (if pebble replanned, then new config
# would be picked up on startup anyway).
if not layer_changed and should_reload:
reloaded = self._prometheus_client.reload_configuration()
if not reloaded:
logger.error("Prometheus failed to reload the configuration")
self.unit.status = early_return_statuses["cfg_load_fail"]
self._stored.status["config"] = to_tuple(early_return_statuses["cfg_load_fail"])
return
if reloaded == "read_timeout":
self.unit.status = early_return_statuses["cfg_load_timeout"]
self._stored.status["config"] = to_tuple(early_return_statuses["cfg_load_timeout"])
return

logger.info("Prometheus configuration reloaded")

if (
isinstance(self.unit.status, BlockedStatus)
and self.unit.status not in early_return_statuses.values()
):
return

self.unit.status = ActiveStatus()
self._stored.status["config"] = to_tuple(ActiveStatus())

def _on_pebble_ready(self, event) -> None:
"""Pebble ready hook.
@@ -681,7 +736,9 @@ def _generate_command(self) -> str:

except ValueError as e:
logger.warning(e)
self.unit.status = BlockedStatus(f"Invalid retention size: {e}")
self._stored.status["retention_size"] = to_tuple(
BlockedStatus(f"Invalid retention size: {e}")
)

else:
# `storage.tsdb.retention.size` uses the legacy binary format, so "GB" and not "GiB"
@@ -692,15 +749,20 @@ def _generate_command(self) -> str:
self._get_pvc_capacity(), ratio
)
except ValueError as e:
self.unit.status = BlockedStatus(f"Error calculating retention size: {e}")
self._stored.status["retention_size"] = to_tuple(
BlockedStatus(f"Error calculating retention size: {e}")
)
except LightkubeApiError as e:
self.unit.status = BlockedStatus(
"Error calculating retention size "
f"(try running `juju trust` on this application): {e}"
self._stored.status["retention_size"] = to_tuple(
BlockedStatus(
"Error calculating retention size "
f"(try running `juju trust` on this application): {e}"
)
)
else:
logger.debug("Retention size limit set to %s (%s%%)", capacity, ratio * 100)
args.append(f"--storage.tsdb.retention.size={capacity}")
self._stored.status["retention_size"] = to_tuple(ActiveStatus())

command = ["/bin/prometheus"] + args

@@ -807,9 +869,7 @@ def _is_valid_timespec(self, timeval: str) -> bool:
timespec_re = re.compile(
r"^((([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?|0)$"
)
if not (matched := timespec_re.search(timeval)):
self.unit.status = BlockedStatus(f"Invalid time spec : {timeval}")

matched = timespec_re.search(timeval)
return bool(matched)

def _percent_string_to_ratio(self, percentage: str) -> float:
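
StoredState only accepts simple marshallable values, which is why the helpers above convert every StatusBase into a (name, message) tuple on the way in and rebuild it with StatusBase.from_name() on the way out. An illustrative round trip (not part of the commit; the message string is made up):

# Illustrative round trip for the to_tuple/to_status helpers.
from ops.model import BlockedStatus, StatusBase

stored = ("blocked", "Invalid retention size: 42")  # shape kept in StoredState
status = StatusBase.from_name(*stored)              # shape handed to event.add_status()
assert isinstance(status, BlockedStatus)
assert status.name == "blocked"
assert status.message == "Invalid retention size: 42"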
55 changes: 50 additions & 5 deletions tests/unit/test_charm.py
@@ -216,6 +216,7 @@ def test_configuration_reload(self, trigger_configuration_reload, *unused):
def test_configuration_reload_success(self, trigger_configuration_reload, *unused):
trigger_configuration_reload.return_value = True
self.harness.update_config({"evaluation_interval": "1234m"})
self.harness.evaluate_status()
self.assertIsInstance(self.harness.model.unit.status, ActiveStatus)

@k8s_resource_multipatch
@@ -224,6 +225,7 @@ def test_configuration_reload_success(self, trigger_configuration_reload, *unused):
def test_configuration_reload_error(self, trigger_configuration_reload, *unused):
trigger_configuration_reload.return_value = False
self.harness.update_config({"evaluation_interval": "1234m"})
self.harness.evaluate_status()
self.assertIsInstance(self.harness.model.unit.status, BlockedStatus)

@k8s_resource_multipatch
@@ -232,6 +234,7 @@ def test_configuration_reload_read_timeout(self, trigger_configuration_reload, *unused):
def test_configuration_reload_read_timeout(self, trigger_configuration_reload, *unused):
trigger_configuration_reload.return_value = "read_timeout"
self.harness.update_config({"evaluation_interval": "1234m"})
self.harness.evaluate_status()
self.assertIsInstance(self.harness.model.unit.status, MaintenanceStatus)


@@ -287,23 +290,61 @@ def test_default_maximum_retention_size_is_80_percent(self, *unused):
plan = self.harness.get_container_pebble_plan("prometheus")
self.assertEqual(cli_arg(plan, "--storage.tsdb.retention.size"), "0.8GB")

# AND WHEN the config option is set and then unset
self.harness.update_config({"maximum_retention_size": "50%"})
self.harness.update_config(unset={"maximum_retention_size"})

# THEN the pebble plan is back to 80%
plan = self.harness.get_container_pebble_plan("prometheus")
self.assertEqual(cli_arg(plan, "--storage.tsdb.retention.size"), "0.8GB")

@k8s_resource_multipatch
@patch("lightkube.core.client.GenericSyncClient")
def test_multiplication_factor_applied_to_pvc_capacity(self, *unused):
"""The `--storage.tsdb.retention.size` arg must be multiplied by maximum_retention_size."""
# GIVEN a capacity limit in binary notation (k8s notation)
self.mock_capacity.return_value = "1Gi"

# AND a multiplication factor as a config option
self.harness.update_config({"maximum_retention_size": "50%"})

# WHEN the charm starts
self.harness.begin_with_initial_hooks()
self.harness.container_pebble_ready("prometheus")

# THEN the pebble plan the adjusted capacity
for set_point, read_back in [("0%", "0GB"), ("50%", "0.5GB"), ("100%", "1GB")]:
with self.subTest(limit=set_point):
# WHEN a limit is set
self.harness.update_config({"maximum_retention_size": set_point})

# THEN the pebble plan the adjusted capacity
plan = self.harness.get_container_pebble_plan("prometheus")
self.assertEqual(cli_arg(plan, "--storage.tsdb.retention.size"), read_back)

@k8s_resource_multipatch
@patch("lightkube.core.client.GenericSyncClient")
def test_invalid_retention_size_config_option_string(self, *unused):
# GIVEN a running charm with default values
self.mock_capacity.return_value = "1Gi"
self.harness.begin_with_initial_hooks()
self.harness.container_pebble_ready("prometheus")
self.harness.evaluate_status()
self.assertIsInstance(self.harness.model.unit.status, ActiveStatus)

# WHEN the config option is set to an invalid string
self.harness.update_config({"maximum_retention_size": "42"})

# THEN cli arg is unspecified and the unit is blocked
plan = self.harness.get_container_pebble_plan("prometheus")
self.assertEqual(cli_arg(plan, "--storage.tsdb.retention.size"), "0.5GB")
self.assertIsNone(cli_arg(plan, "--storage.tsdb.retention.size"))
self.harness.evaluate_status()
self.assertIsInstance(self.harness.model.unit.status, BlockedStatus)

# AND WHEN the config option is corrected
self.harness.update_config({"maximum_retention_size": "42%"})

# THEN cli arg is updated and the unit is goes back to active
plan = self.harness.get_container_pebble_plan("prometheus")
self.assertEqual(cli_arg(plan, "--storage.tsdb.retention.size"), "0.42GB")
self.harness.evaluate_status()
self.assertIsInstance(self.harness.model.unit.status, ActiveStatus)


@prom_multipatch
@@ -654,6 +695,7 @@ def test_ca_file(self, *_):
},
)

self.harness.evaluate_status()
self.assertIsInstance(self.harness.model.unit.status, ActiveStatus)
container = self.harness.charm.unit.get_container("prometheus")
self.assertEqual(container.pull("/etc/prometheus/job1-ca.crt").read(), "CA 1")
@@ -690,6 +732,7 @@ def test_no_tls_config(self, *_):
},
)

self.harness.evaluate_status()
self.assertIsInstance(self.harness.model.unit.status, ActiveStatus)

@k8s_resource_multipatch
@@ -725,6 +768,7 @@ def test_tls_config_missing_cert(self, *_):
},
)

self.harness.evaluate_status()
self.assertIsInstance(self.harness.model.unit.status, BlockedStatus)

@k8s_resource_multipatch
@@ -760,4 +804,5 @@ def test_tls_config_missing_key(self, *_):
},
)

self.harness.evaluate_status()
self.assertIsInstance(self.harness.model.unit.status, BlockedStatus)
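
The test-side edits all follow one pattern: with collect-status, the framework only refreshes unit.status when collect_unit_status is emitted, which the test harness triggers explicitly via evaluate_status(). A standalone illustration (hypothetical snippet; `harness` is assumed to be an ops.testing.Harness already begun for this charm, as in the tests above):

# Hypothetical snippet mirroring test_invalid_retention_size_config_option_string.
harness.update_config({"maximum_retention_size": "42"})  # invalid: missing "%"
harness.evaluate_status()  # emits collect_unit_status and applies the result
assert harness.model.unit.status.name == "blocked"

harness.update_config({"maximum_retention_size": "42%"})  # corrected
harness.evaluate_status()
assert harness.model.unit.status.name == "active"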
2 changes: 2 additions & 0 deletions tests/unit/test_charm_status.py
@@ -58,6 +58,7 @@ def test_unit_is_active_if_deployed_without_relations_or_config(self, *unused):
# WHEN no config is provided or relations created

# THEN the unit goes into active state
self.harness.evaluate_status()
self.assertIsInstance(self.harness.charm.unit.status, ActiveStatus)

# AND pebble plan is not empty
@@ -90,6 +91,7 @@ def test_unit_is_blocked_if_reload_configuration_fails(self, *unused):
# WHEN no config is provided or relations created

# THEN the unit goes into blocked state
self.harness.evaluate_status()
self.assertIsInstance(self.harness.charm.unit.status, BlockedStatus)

# AND pebble plan is not empty