diff --git a/CHANGELOG.md b/CHANGELOG.md index 60dfdf2..6e032b9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.6.0] - 2023-11-30 + - Added badges to README.md (#62). +- Config now accommodates client ID, key, and name, allowing users to specify individual client details (#63). +- Added client authentication using SHA512-hashed keys for enhanced security (#63). + ## [0.5.0] - 2023-10-26 diff --git a/README.md b/README.md index b67fecd..e6f7e0c 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ receivers: ... - name: cos-alerter webhook_configs: - - url: http://:8080/alive?clientid= + - url: http://:8080/alive?clientid=&key= route: ... routes: diff --git a/cos_alerter/alerter.py b/cos_alerter/alerter.py index 7acf7bb..401380f 100644 --- a/cos_alerter/alerter.py +++ b/cos_alerter/alerter.py @@ -6,6 +6,7 @@ import datetime import logging import os +import sys import textwrap import threading import time @@ -14,7 +15,8 @@ import apprise import durationpy -import yaml +from ruamel.yaml import YAML +from ruamel.yaml.constructor import DuplicateKeyError logger = logging.getLogger(__name__) @@ -30,14 +32,35 @@ def set_path(self, path: str): """Set the config file path.""" self.path = Path(path) + def _validate_hashes(self, clients): + """Validate that keys in the clients dictionary are valid SHA-512 hashes.""" + for client_info in clients.values(): + client_key = client_info.get("key", "") + is_valid = len(client_key) == 128 + if client_key and not is_valid: + return False + return True + def reload(self): """Reload config values from the disk.""" + yaml = YAML(typ="rt") with open( os.path.join(os.path.dirname(os.path.realpath(__file__)), "config-defaults.yaml") ) as f: - self.data = yaml.safe_load(f) + self.data = yaml.load(f) with open(self.path, "r") as f: - user_data = yaml.safe_load(f) + try: + user_data = yaml.load(f) + except DuplicateKeyError: + logger.critical("Duplicate client IDs found in COS Alerter config. Exiting...") + sys.exit(1) + + # Validate that keys are valid SHA-512 hashes + if user_data and user_data.get("watch", {}).get("clients"): + if not self._validate_hashes(user_data["watch"]["clients"]): + logger.critical("Invalid SHA-512 hash(es) in config. Exiting...") + sys.exit(1) + deep_update(self.data, user_data) self.data["watch"]["down_interval"] = durationpy.from_str( self.data["watch"]["down_interval"] @@ -50,15 +73,15 @@ def reload(self): def deep_update(base: dict, new: typing.Optional[dict]): """Deep dict update. - Same as dict.update() except it recurses into dubdicts. + Same as dict.update() except it recurses into subdicts. """ if new is None: return - for key in base: - if key in new and isinstance(base[key], dict): - deep_update(base[key], new[key]) - elif key in new: - base[key] = new[key] + for key, new_value in new.items(): + if key in base and isinstance(base[key], dict) and isinstance(new_value, dict): + deep_update(base[key], new_value) + else: + base[key] = new_value config = Config() @@ -120,9 +143,9 @@ def initialize(): # ... # } state["clients"] = {} - for client in config["watch"]["clients"]: + for client_id in config["watch"]["clients"]: alert_time = None if config["watch"]["wait_for_first_connection"] else current_time - state["clients"][client] = { + state["clients"][client_id] = { "lock": threading.Lock(), "alert_time": alert_time, "notify_time": None, diff --git a/cos_alerter/config-defaults.yaml b/cos_alerter/config-defaults.yaml index 6e81a03..936e04f 100644 --- a/cos_alerter/config-defaults.yaml +++ b/cos_alerter/config-defaults.yaml @@ -8,13 +8,19 @@ watch: # This allows you to configure COS Alerter before configuring Alertmanager. wait_for_first_connection: true - # The list of Alertmanager instances we are monitoring. Alertmanager instances should be - # configured with the clientid= parameter. + # Configuration for monitoring Alertmanager instances. + # - clientid: Unique identifier for the Alertmanager instance. + # - key: Secret key for authenticating and authorizing communication with COS Alerter. (Should be a SHA512 hash) + # - name: Descriptive name for the instance. # eg: # clients: - # - "client0" - # - "client1" - clients: [] + # clientid0: + # key: "822295b207a0b73dd4690b60a03c55599346d44aef3da4cf28c3296eadb98b2647ae18863cc3ae8ae5574191b60360858982fd8a8d176c0edf646ce6eee24ef9" + # name: "Instance Name 0" + # clientid1: + # key: "0415b0cad09712bd1ed094bc06ed421231d0603465e9841c959e9f9dcf735c9ce704df7a0c849a4e0db405c916f679a0e6c3f63f9e26191dda8069e1b44a3bc8" + # name: "Instance Name 1" + clients: {} notify: diff --git a/cos_alerter/server.py b/cos_alerter/server.py index ac41e6d..398d40b 100644 --- a/cos_alerter/server.py +++ b/cos_alerter/server.py @@ -3,6 +3,7 @@ """HTTP server for COS Alerter.""" +import hashlib import logging import timeago @@ -28,9 +29,10 @@ def dashboard(): status = "up" if not state.is_down() else "down" if last_alert is None: status = "unknown" + client_name = config["watch"]["clients"][clientid].get("name", "") clients.append( { - "clientid": clientid, + "client_name": client_name, "status": status, "alert_time": alert_time, } @@ -44,16 +46,28 @@ def alive(): # TODO Decide if we should validate the request. params = request.args clientid_list = params.getlist("clientid") # params is a werkzeug.datastructures.MultiDict - if len(clientid_list) < 1: - logger.warning("Request %s has no clientid.", request.url) - return 'Parameter "clientid" required.', 400 - if len(clientid_list) > 1: - logger.warning("Request %s specified clientid more than once.", request.url) - return 'Parameter "clientid" provided more than once.', 400 + key_list = params.getlist("key") + + if len(clientid_list) < 1 or len(key_list) < 1: + logger.warning("Request %s is missing clientid or key.", request.url) + return 'Parameters "clientid" and "key" are required.', 400 + if len(clientid_list) > 1 or len(key_list) > 1: + logger.warning("Request %s specified clientid or key more than once.", request.url) + return 'Parameters "clientid" and "key" should be provided exactly once.', 400 clientid = clientid_list[0] - if clientid not in config["watch"]["clients"]: - logger.warning("Request %s specified an unknown clientid.") + key = key_list[0] + + # Find the client with the specified clientid + client_info = config["watch"]["clients"].get(clientid) + if not client_info: + logger.warning("Request %s specified an unknown clientid.", request.url) return 'Clientid {params["clientid"]} not found. ', 404 + + # Hash the key and compare with the stored hashed key + hashed_key = hashlib.sha512(key.encode()).hexdigest() + if hashed_key != client_info.get("key", ""): + logger.warning("Request %s provided an incorrect key.", request.url) + return "Incorrect key for the specified clientid.", 401 logger.info("Received alert from Alertmanager clientid: %s.", clientid) with AlerterState(clientid) as state: state.reset_alert_timeout() diff --git a/cos_alerter/templates/dashboard.html b/cos_alerter/templates/dashboard.html index cb0ba62..579d076 100644 --- a/cos_alerter/templates/dashboard.html +++ b/cos_alerter/templates/dashboard.html @@ -41,7 +41,7 @@

Clients

{% for client in clients %} - {{ client["clientid"] }} + {{ client["client_name"] }} {% if client["status"] == "up" %} ✅ Up {% elif client["status"] == "down" %} diff --git a/pyproject.toml b/pyproject.toml index bafb5c8..512834f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "cos-alerter" -version = "0.5.0" +version = "0.6.0" authors = [ { name="Dylan Stephano-Shachter", email="dylan.stephano-shachter@canonical.com" } ] @@ -26,6 +26,7 @@ dependencies = [ "pyyaml~=6.0", "timeago~=1.0", "waitress~=2.1", + "ruamel.yaml~=0.18.0" ] [project.urls] diff --git a/rockcraft.yaml b/rockcraft.yaml index 9d445f9..fafe2fc 100644 --- a/rockcraft.yaml +++ b/rockcraft.yaml @@ -1,7 +1,7 @@ name: cos-alerter summary: A liveness checker for self-monitoring. description: Receive regular pings from the cos stack and alert when they stop. -version: "0.5.0" # NOTE: Make sure this matches `cos-alerter` below +version: "0.6.0" # NOTE: Make sure this matches `cos-alerter` below base: ubuntu:22.04 license: Apache-2.0 platforms: @@ -11,7 +11,7 @@ parts: plugin: python source: . python-packages: - - cos-alerter==0.5.0 # NOTE: Make sure this matches `version` above + - cos-alerter==0.6.0 # NOTE: Make sure this matches `version` above stage-packages: - python3-venv services: diff --git a/snap/snapcraft.yaml b/snap/snapcraft.yaml index 9a41063..eb27652 100644 --- a/snap/snapcraft.yaml +++ b/snap/snapcraft.yaml @@ -1,5 +1,5 @@ name: cos-alerter -version: '0.5.0' +version: '0.6.0' summary: A watchdog alerting on alertmanager notification failures. license: Apache-2.0 contact: simon.aronsson@canonical.com diff --git a/tests/helpers.py b/tests/helpers.py index 555ee19..8ae1d33 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -10,7 +10,12 @@ "watch": { "down_interval": "5m", "wait_for_first_connection": False, - "clients": ["client0"], + "clients": { + "clientid1": { + "key": "822295b207a0b73dd4690b60a03c55599346d44aef3da4cf28c3296eadb98b2647ae18863cc3ae8ae5574191b60360858982fd8a8d176c0edf646ce6eee24ef9", + "name": "Instance Name 1", + }, + }, }, "notify": { "destinations": DESTINATIONS, diff --git a/tests/test_alerter.py b/tests/test_alerter.py index 39763c9..d52c29a 100644 --- a/tests/test_alerter.py +++ b/tests/test_alerter.py @@ -29,6 +29,53 @@ def test_config_default_empty_file(fake_fs): assert config["watch"]["down_interval"] == 300 +def test_duplicate_key_error(fake_fs): + duplicate_config = """ + watch: + down_interval: "5m" + wait_for_first_connection: true + clients: + clientid1: + key: "clientkey1" + name: "Instance Name 1" + clientid1: + key: "clientkey1" + name: "Instance Name 1" + """ + with open("/etc/cos-alerter.yaml", "w") as f: + f.write(duplicate_config) + + try: + config.reload() + except SystemExit as exc: + assert exc.code == 1 + else: + # If no exception is raised, fail the test + assert False + + +def test_invalid_hashes(fake_fs): + duplicate_config = """ + watch: + down_interval: "5m" + wait_for_first_connection: true + clients: + invalidhashclient: + key: "E0E06B8DB6ED8DD4E1FFE98376E606BDF4FE4ABB4AF65BFE8B18FBFA6564D8B3" + name: "Instance Name 1" + """ + with open("/etc/cos-alerter.yaml", "w") as f: + f.write(duplicate_config) + + try: + config.reload() + except SystemExit as exc: + assert exc.code == 1 + else: + # If no exception is raised, fail the test + assert False + + def test_config_default_partial_file(fake_fs): conf = yaml.dump({"log_level": "info"}) with open("/etc/cos-alerter.yaml", "w") as f: @@ -50,7 +97,7 @@ def test_config_default_override(fake_fs): def test_initialize(monotonic_mock, fake_fs): monotonic_mock.return_value = 1000 AlerterState.initialize() - state = AlerterState(clientid="client0") + state = AlerterState(clientid="clientid1") with state: assert state.start_date == 1672531200.0 assert state.start_time == 1000 @@ -72,7 +119,7 @@ def test_up_time(monotonic_mock, fake_fs): def test_is_down_from_initialize(monotonic_mock, fake_fs): monotonic_mock.return_value = 1000 AlerterState.initialize() - state = AlerterState(clientid="client0") + state = AlerterState(clientid="clientid1") with state: monotonic_mock.return_value = 1180 # Three minutes have passed assert state.is_down() is False @@ -85,7 +132,7 @@ def test_is_down_from_initialize(monotonic_mock, fake_fs): def test_is_down_with_reset_alert_timeout(monotonic_mock, fake_fs): monotonic_mock.return_value = 1000 AlerterState.initialize() - state = AlerterState(clientid="client0") + state = AlerterState(clientid="clientid1") with state: monotonic_mock.return_value = 2000 state.reset_alert_timeout() @@ -106,7 +153,7 @@ def test_is_down_with_wait_for_first_connection(monotonic_mock, fake_fs): config.reload() monotonic_mock.return_value = 1000 AlerterState.initialize() - state = AlerterState(clientid="client0") + state = AlerterState(clientid="clientid1") with state: monotonic_mock.return_value = 1500 assert state.is_down() is False # 6 minutes have passes but we have not started counting. @@ -122,7 +169,7 @@ def test_is_down_with_wait_for_first_connection(monotonic_mock, fake_fs): def test_is_down(monotonic_mock, fake_fs): monotonic_mock.return_value = 1000 AlerterState.initialize() - state = AlerterState(clientid="client0") + state = AlerterState(clientid="clientid1") with state: monotonic_mock.return_value = 2000 state.reset_alert_timeout() @@ -137,7 +184,7 @@ def test_is_down(monotonic_mock, fake_fs): def test_recently_notified(monotonic_mock, fake_fs): monotonic_mock.return_value = 1000 AlerterState.initialize() - state = AlerterState(clientid="client0") + state = AlerterState(clientid="clientid1") with state: state._set_notify_time() monotonic_mock.return_value = 2800 # 30 minutes have passed @@ -153,7 +200,7 @@ def test_recently_notified(monotonic_mock, fake_fs): def test_notify(notify_mock, add_mock, monotonic_mock, fake_fs): monotonic_mock.return_value = 1000 AlerterState.initialize() - state = AlerterState(clientid="client0") + state = AlerterState(clientid="clientid1") with state: state.notify() @@ -166,7 +213,7 @@ def test_notify(notify_mock, add_mock, monotonic_mock, fake_fs): title="**Alertmanager is Down!**", body=textwrap.dedent( """ - Your Alertmanager instance: client0 seems to be down! + Your Alertmanager instance: clientid1 seems to be down! It has not alerted COS-Alerter ever. """ ), diff --git a/tests/test_daemon.py b/tests/test_daemon.py index d6e753b..ebffc27 100644 --- a/tests/test_daemon.py +++ b/tests/test_daemon.py @@ -29,7 +29,12 @@ def mock_fs(fake_fs): "watch": { "down_interval": "4s", "wait_for_first_connection": False, - "clients": ["client0"], + "clients": { + "clientid1": { + "key": "822295b207a0b73dd4690b60a03c55599346d44aef3da4cf28c3296eadb98b2647ae18863cc3ae8ae5574191b60360858982fd8a8d176c0edf646ce6eee24ef9", + "name": "Instance Name 1", + }, + }, }, "notify": { "destinations": DESTINATIONS, @@ -53,7 +58,14 @@ def test_main(notify_mock, add_mock, mock_fs): main_thread.start() time.sleep(2) # Should not be considered down yet. notify_mock.assert_not_called() - subprocess.call(["curl", "-X", "POST", "http://localhost:8080/alive?clientid=client0"]) + subprocess.call( + [ + "curl", + "-X", + "POST", + "http://localhost:8080/alive?clientid=clientid1&key=clientkey1", + ] + ) time.sleep(3) # Would be considered down but we just sent an alive call. notify_mock.assert_not_called() time.sleep(3) # It has been > 4 seconds since we last alerted so it should be down. diff --git a/tests/test_server.py b/tests/test_server.py index ad35a1a..df7544b 100644 --- a/tests/test_server.py +++ b/tests/test_server.py @@ -11,7 +11,7 @@ from cos_alerter.alerter import AlerterState, config from cos_alerter.server import app -PARAMS = {"clientid": "client0"} +PARAMS = {"clientid": "clientid1", "key": "clientkey1"} @pytest.fixture @@ -29,7 +29,9 @@ def test_dashboard_succeeds(flask_client, fake_fs, state_init): def test_alive_succeeds(flask_client, fake_fs, state_init): - assert flask_client.post("/alive", query_string=PARAMS).status_code == 200 + response = flask_client.post("/alive", query_string=PARAMS) + assert response.status_code == 200 + assert len(response.data) > 0 def test_alive_other_methods_fail(flask_client, fake_fs, state_init): @@ -42,7 +44,7 @@ def test_alive_other_methods_fail(flask_client, fake_fs, state_init): def test_alive_updates_time(flask_client, fake_fs, state_init): flask_client.post("/alive", query_string=PARAMS) - state = AlerterState(clientid="client0") + state = AlerterState(clientid="clientid1") with state: assert state.data["alert_time"] > state.start_time @@ -52,23 +54,60 @@ def test_metrics_succeeds(flask_client, fake_fs, state_init): def test_no_clientid(flask_client, fake_fs, state_init): - assert flask_client.post("/alive").status_code == 400 + response = flask_client.post("/alive") + assert response.status_code == 400 + assert len(response.data) > 0 def test_wrong_clientid(flask_client, fake_fs, state_init): - assert flask_client.post("/alive", query_string={"clientid": "client1"}).status_code == 404 + response = flask_client.post( + "/alive", + query_string={"clientid": "clientid2", "key": "clientkey1"}, + ) + assert response.status_code == 404 def test_duplicate_clientid(flask_client, fake_fs, state_init): conf = copy.deepcopy(CONFIG) - conf["watch"]["clients"].append("client1") + conf["watch"]["clients"]["clientid2"] = { + "key": "0415b0cad09712bd1ed094bc06ed421231d0603465e9841c959e9f9dcf735c9ce704df7a0c849a4e0db405c916f679a0e6c3f63f9e26191dda8069e1b44a3bc8", + "name": "Client 2", + } with open("/etc/cos-alerter.yaml", "w") as f: f.write(yaml.dump(conf)) config.reload() params = MultiDict( [ - ("clientid", "client0"), - ("clientid", "client1"), + ("clientid", "clientid1"), + ("key", "clientkey1"), + ("clientid", "clientid2"), + ("key", "clientkey2"), ] ) - assert flask_client.post("/alive", query_string=params).status_code == 400 + response = flask_client.post("/alive", query_string=params) + assert response.status_code == 400 + assert len(response.data) > 0 + + +def test_invalid_key(flask_client, fake_fs, state_init): + response = flask_client.post( + "/alive", + query_string={"clientid": "clientid1", "key": "incorrect-key"}, + ) + assert response.status_code == 401 + assert len(response.data) > 0 + + +def test_missing_key(flask_client, fake_fs, state_init): + response = flask_client.post("/alive", query_string={"clientid": "clientid1"}) + assert response.status_code == 400 + assert len(response.data) > 0 + + +def test_multiple_key_values(flask_client, fake_fs, state_init): + response = flask_client.post( + "/alive", + query_string={"clientid": "clientid1", "key": ["key1", "key2"]}, + ) + assert response.status_code == 400 + assert len(response.data) > 0 diff --git a/tox.ini b/tox.ini index 7f1e633..cdc43c3 100644 --- a/tox.ini +++ b/tox.ini @@ -47,6 +47,7 @@ deps = pytest pyyaml werkzeug + ruamel.yaml commands = coverage run --source {[vars]src_path} -m pytest -m "not slow" -v --log-cli-level=INFO {[vars]tst_path} @@ -59,6 +60,7 @@ deps = pyfakefs pytest pyyaml + ruamel.yaml commands = coverage run -a --source {[vars]src_path} -m pytest -m slow -v --log-cli-level=INFO {[vars]tst_path}