From c7a6c9706a9fe70722a8cf7689939f0c35900a3f Mon Sep 17 00:00:00 2001 From: Enzo Aguado Date: Tue, 8 Sep 2020 16:37:21 +0200 Subject: [PATCH] add NRPE checks This commit introduces a separate library for the NRPE client which is influenced by the charmhelpers nrpe client. Only a basic TCP check is performed at the moment to ensure nats is up and running --- config.yaml | 16 +++++ lib/nrpe/client.py | 148 +++++++++++++++++++++++++++++++++++++++++++++ metadata.yaml | 3 + src/charm.py | 21 ++++++- 4 files changed, 185 insertions(+), 3 deletions(-) create mode 100644 lib/nrpe/client.py diff --git a/config.yaml b/config.yaml index 77bfe77..d2512d9 100644 --- a/config.yaml +++ b/config.yaml @@ -51,3 +51,19 @@ options: type: string default: stable description: Snap Store channel to install the NATs snap from + nagios_context: + default: "juju" + type: string + description: | + Used by the nrpe subordinate charms. + A string that will be prepended to instance name to set the host name + in nagios. So for instance the hostname would be something like: + juju-myservice-0 + If you're running multiple environments with the same services in them + this allows you to differentiate between them. + nagios_servicegroups: + default: "" + type: string + description: | + A comma-separated list of nagios servicegroups. 
+ If left empty, the nagios_context will be used as the servicegroup \ No newline at end of file diff --git a/lib/nrpe/client.py b/lib/nrpe/client.py new file mode 100644 index 0000000..15768e6 --- /dev/null +++ b/lib/nrpe/client.py @@ -0,0 +1,148 @@ +import logging +import os +import subprocess +from typing import List + +import yaml + +from ops.framework import EventBase, EventSource, EventsBase, StoredState +from ops.framework import Object + +logger = logging.getLogger(__name__) + +class NRPEAvailable(EventBase): + pass + + +class NRPEClientEvents(EventsBase): + nrpe_available = EventSource(NRPEAvailable) + + +class NRPEClient(Object): + on = NRPEClientEvents() + state = StoredState() + + nrpe_confdir = '/etc/nagios/nrpe.d' + nagios_exportdir = '/var/lib/nagios/export' + check_template = """ +#--------------------------------------------------- +# This file is Juju managed +#--------------------------------------------------- +command[%(check_name)s]=%(command)s +""" + service_template = (""" +#--------------------------------------------------- +# This file is Juju managed +#--------------------------------------------------- +define service { + use active-service + host_name %(hostname)s + service_description %(hostname)s[%(check_name)s] """ + """%(description)s + check_command check_nrpe!%(check_name)s + servicegroups %(servicegroup)s +} +""") + + def __init__(self, charm, relation_name='nrpe-external-master'): + super().__init__(charm, relation_name) + self._relation_name = relation_name + self.state.set_default(checks={}, dirty=False, nrpe_ready=False) + + self.framework.observe(charm.on[relation_name].relation_changed, self.on_relation_changed) + + @property + def is_joined(self): + return self.framework.model.get_relation(self._relation_name) is not None + + @property + def is_available(self): + return self.state.nrpe_ready + + def add_check(self, command: List[str], name: str, description: str = None, hostname: str = None): + """ + Register a new check 
to be executed by NRPE. + Call NRPEClient.commit() to save changes. + If a check with the same name already exists, it will be updated. + :param command: A string array containing the command to be executed + :param name: Human readable name for the check + :param description: A short description of the check + :param hostname: Unit hostname. Defaults to a combination of nagios_context and unit name + """ + nagios_context = self.model.config['nagios_context'] + nagios_servicegroups = self.model.config.get('nagios_servicegroups') or nagios_context + unit_name = self.model.unit.name.replace("/", "_") + hostname = hostname or f"{nagios_context}-{unit_name}" + if not description: + description = f'{name} {unit_name}' + + new_check = { + 'command': command, + 'description': description, + 'hostname': hostname, + 'servicegroup': nagios_servicegroups, + } + + if name not in self.state.checks or self.state.checks[name] != new_check: + self.state.dirty = True + self.state.checks[name] = new_check + + def remove_check(self, name: str): + self.state.checks.pop(name, None) + + def commit(self): + """Commit checks to NRPE and Nagios""" + if not self.state.dirty: + logger.info('Skipping NRPE commit as nothing changed') + return + + if not self.state.nrpe_ready: + logger.info('NRPE relation is not ready') + return + + self._write_check_files() + self._publish_to_nagios() + subprocess.check_call(['systemctl', 'restart', 'nagios-nrpe-server']) + self.state.dirty = False + logger.info(f'Successfully updated NRPE checks: {", ".join(c for c in self.state.checks)}') + + def _write_check_files(self): + """Register the new checks with NRPE and place their configuration files in the appropriate locations""" + for check_name in self.state.checks: + check = self.state.checks[check_name] + + check_filename = os.path.join(self.nrpe_confdir, f'{check_name}.cfg') + check_args = { + 'check_name': check_name, + 'command': ' '.join(check['command']) + } + with open(check_filename, 'w') as 
check_config: + check_config.write(self.check_template % check_args) + + service_filename = os.path.join(self.nagios_exportdir, 'service__{}_{}.cfg'.format(check['hostname'], check_name)) + service_args = { + 'hostname': check['hostname'], + 'description': check['description'], + 'check_name': check_name, + 'servicegroup': check['servicegroup'] + } + with open(service_filename, 'w') as service_config: + service_config.write(self.service_template % service_args) + + def _publish_to_nagios(self): + """Publish check data on the monitors relation""" + rel = self.framework.model.get_relation(self._relation_name) + rel_data = rel.data[self.model.unit] + rel_data['version'] = '0.3' + + nrpe_monitors = {} + for check_name in self.state.checks: + nrpe_monitors[check_name] = {'command': check_name} + + rel_data['monitors'] = yaml.dump({"monitors": {"remote": {"nrpe": nrpe_monitors}}}) + + def on_relation_changed(self, event): + if not self.state.nrpe_ready: + self.state.nrpe_ready = True + self.on.nrpe_available.emit() + diff --git a/metadata.yaml b/metadata.yaml index bce6033..c3365fb 100644 --- a/metadata.yaml +++ b/metadata.yaml @@ -14,6 +14,9 @@ series: provides: client: interface: nats + nrpe-external-master: + interface: nrpe-external-master + scope: container requires: ca-client: interface: tls-certificates diff --git a/src/charm.py b/src/charm.py index fbea633..e83c88c 100755 --- a/src/charm.py +++ b/src/charm.py @@ -18,6 +18,7 @@ ModelError, BlockedStatus, ) +from nrpe.client import NRPEClient from interfaces import NatsCluster, NatsClient, CAClient from jinja2 import Environment, FileSystemLoader @@ -74,6 +75,9 @@ def __init__(self, framework, key): self.framework.observe(self.ca_client.on.tls_config_ready, self) self.framework.observe(self.ca_client.on.ca_available, self) + self.nrpe_client = NRPEClient(self, 'nrpe-external-master') + self.framework.observe(self.nrpe_client.on.nrpe_available, self) + def on_install(self, event): try: core_res = 
self.model.resources.fetch('core') @@ -111,10 +115,10 @@ def handle_tls_config(self): load_pem_private_key(tls_key, backend=default_backend()) tls_cert = self.model.config['tls-cert'] if tls_cert: - load_pem_x509_certificate(tls_cert, bacend=default_backend()) + load_pem_x509_certificate(tls_cert, backend=default_backend()) tls_ca_cert = self.model.config['tls-ca-cert'] if tls_ca_cert: - load_pem_x509_certificate(tls_ca_cert, default_backend()) + load_pem_x509_certificate(tls_ca_cert, backend=default_backend()) self.state.use_tls = tls_key and tls_cert self.state.use_tls_ca = bool(tls_ca_cert) @@ -129,6 +133,9 @@ def handle_tls_config(self): self.TLS_CA_CERT_PATH.write_text(tls_ca_cert) self.client.set_tls_ca(tls_ca_cert) + def on_nrpe_available(self, event): + self.reconfigure_nats() + def on_ca_available(self, event): self.reconfigure_nats() @@ -206,6 +213,15 @@ def reconfigure_nats(self): }) self.client.set_tls_ca( self.ca_client.ca_certificate.public_bytes(encoding=serialization.Encoding.PEM).decode('utf-8')) + + if self.nrpe_client.is_available: + self.nrpe_client.add_check(command=[ + '/usr/lib/nagios/plugins/check_tcp', + '-H', str(self.client.listen_address), + '-p', str(self.model.config['client-port']) + ], name='check_tcp') + self.nrpe_client.commit() + tenv = Environment(loader=FileSystemLoader('templates')) template = tenv.get_template('nats.cfg.j2') rendered_content = template.render(ctxt) @@ -213,7 +229,6 @@ def reconfigure_nats(self): old_hash = self.state.nats_config_hash if old_hash != content_hash: logging.info(f'Config has changed - re-rendering a template to {self.NATS_SERVER_CONFIG_PATH}') - logger.info('') self.state.nats_config_hash = content_hash self.NATS_SERVER_CONFIG_PATH.write_text(rendered_content) if self.state.is_started: