Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add NRPE checks #10

Merged
merged 1 commit into from
Sep 9, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,19 @@ options:
type: string
default: stable
description: Snap Store channel to install the NATS snap from
nagios_context:
default: "juju"
type: string
description: |
Used by the nrpe subordinate charms.
A string that will be prepended to instance name to set the host name
in nagios. So for instance the hostname would be something like:
juju-myservice-0
If you're running multiple environments with the same services in them
this allows you to differentiate between them.
nagios_servicegroups:
default: ""
type: string
description: |
A comma-separated list of nagios servicegroups.
If left empty, the nagios_context will be used as the servicegroup
148 changes: 148 additions & 0 deletions lib/nrpe/client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
import logging
import os
import subprocess
from typing import List

import yaml

from ops.framework import EventBase, EventSource, EventsBase, StoredState
from ops.framework import Object

logger = logging.getLogger(__name__)

class NRPEAvailable(EventBase):
    """Event emitted by NRPEClient the first time its relation changes."""


class NRPEClientEvents(EventsBase):
    """Container for the events that NRPEClient can emit."""

    nrpe_available = EventSource(NRPEAvailable)


class NRPEClient(Object):
    """Client side of the NRPE relation (``nrpe-external-master`` by default).

    Check definitions are accumulated in stored state via :meth:`add_check`
    and :meth:`remove_check`. :meth:`commit` then writes the NRPE command
    files and the Nagios service definitions, publishes the monitors on the
    relation and restarts ``nagios-nrpe-server``.

    ``on.nrpe_available`` is emitted the first time the relation changes.
    """

    on = NRPEClientEvents()
    state = StoredState()

    # Directory receiving one NRPE command file per check.
    nrpe_confdir = '/etc/nagios/nrpe.d'
    # Directory receiving one Nagios service definition per check.
    nagios_exportdir = '/var/lib/nagios/export'
    check_template = """
#---------------------------------------------------
# This file is Juju managed
#---------------------------------------------------
command[%(check_name)s]=%(command)s
"""
    service_template = ("""
#---------------------------------------------------
# This file is Juju managed
#---------------------------------------------------
define service {
use active-service
host_name %(hostname)s
service_description %(hostname)s[%(check_name)s] """
                        """%(description)s
check_command check_nrpe!%(check_name)s
servicegroups %(servicegroup)s
}
""")

    def __init__(self, charm, relation_name='nrpe-external-master'):
        """Bind the client to ``charm`` and observe ``relation_name``.

        :param charm: The charm object that owns this client
        :param relation_name: Name of the NRPE relation to watch
        """
        super().__init__(charm, relation_name)
        self._relation_name = relation_name
        self.state.set_default(checks={}, dirty=False, nrpe_ready=False)

        self.framework.observe(charm.on[relation_name].relation_changed, self.on_relation_changed)

    @property
    def is_joined(self):
        """True when the model currently has a relation with our name."""
        return self.framework.model.get_relation(self._relation_name) is not None

    @property
    def is_available(self):
        """True once a relation-changed event has been seen."""
        return self.state.nrpe_ready

    def add_check(self, command: List[str], name: str, description: str = None, hostname: str = None):
        """
        Register a new check to be executed by NRPE.
        Call NRPEClient.commit() to save changes.
        If a check with the same name already exists, it will be updated.
        :param command: A string array containing the command to be executed
        :param name: Human readable name for the check
        :param description: A short description of the check
        :param hostname: Unit hostname. Defaults to a combination of nagios_context and unit name
        """
        nagios_context = self.model.config['nagios_context']
        # An empty or missing servicegroups option falls back to the context.
        nagios_servicegroups = self.model.config.get('nagios_servicegroups') or nagios_context
        unit_name = self.model.unit.name.replace("/", "_")
        hostname = hostname or f"{nagios_context}-{unit_name}"
        if not description:
            description = f'{name} {unit_name}'

        new_check = {
            'command': command,
            'description': description,
            'hostname': hostname,
            'servicegroup': nagios_servicegroups,
        }

        # Only flag the state as dirty when the definition really changed,
        # so that commit() can skip needless rewrites and service restarts.
        if name not in self.state.checks or self.state.checks[name] != new_check:
            self.state.dirty = True
            self.state.checks[name] = new_check

    def remove_check(self, name: str):
        """
        Unregister a previously added check.
        Call NRPEClient.commit() to save changes.
        Unknown names are ignored.
        :param name: Name the check was registered under
        """
        if name in self.state.checks:
            self.state.checks.pop(name, None)
            # Without this flag commit() would consider the state unchanged
            # and keep publishing the removed check on the relation.
            self.state.dirty = True
            # NOTE(review): config files already written for this check are
            # not deleted here or by commit() - confirm whether stale files
            # need separate cleanup.

    def commit(self):
        """Commit checks to NRPE and Nagios"""
        if not self.state.dirty:
            logger.info('Skipping NRPE commit as nothing changed')
            return

        if not self.state.nrpe_ready:
            logger.info('NRPE relation is not ready')
            return

        # nrpe_ready is never reset, so the relation may have gone away since
        # it was set. Stay dirty and retry on a later commit instead of
        # failing on a missing relation object.
        if not self.is_joined:
            logger.info('NRPE relation is no longer present')
            return

        self._write_check_files()
        self._publish_to_nagios()
        subprocess.check_call(['systemctl', 'restart', 'nagios-nrpe-server'])
        self.state.dirty = False
        logger.info('Successfully updated NRPE checks: %s', ", ".join(self.state.checks))

    def _write_check_files(self):
        """Register the new checks with NRPE and place their configuration files in the appropriate locations"""
        for check_name in self.state.checks:
            check = self.state.checks[check_name]

            # NRPE command definition for this check.
            check_filename = os.path.join(self.nrpe_confdir, f'{check_name}.cfg')
            check_args = {
                'check_name': check_name,
                'command': ' '.join(check['command']),
            }
            with open(check_filename, 'w') as check_config:
                check_config.write(self.check_template % check_args)

            # Nagios service definition for this check.
            service_filename = os.path.join(
                self.nagios_exportdir,
                f"service__{check['hostname']}_{check_name}.cfg",
            )
            service_args = {
                'hostname': check['hostname'],
                'description': check['description'],
                'check_name': check_name,
                'servicegroup': check['servicegroup'],
            }
            with open(service_filename, 'w') as service_config:
                service_config.write(self.service_template % service_args)

    def _publish_to_nagios(self):
        """Publish check data on the monitors relation"""
        rel = self.framework.model.get_relation(self._relation_name)
        rel_data = rel.data[self.model.unit]
        # Static version of the monitors spec. Per the review discussion it
        # is not really used but does not hurt to specify; see
        # https://git.launchpad.net/charm-nrpe/tree/README.md#n120
        rel_data['version'] = '0.3'

        nrpe_monitors = {check_name: {'command': check_name} for check_name in self.state.checks}

        rel_data['monitors'] = yaml.dump({"monitors": {"remote": {"nrpe": nrpe_monitors}}})

    def on_relation_changed(self, event):
        """Mark NRPE as ready and emit ``nrpe_available`` on the first change only."""
        if not self.state.nrpe_ready:
            self.state.nrpe_ready = True
            self.on.nrpe_available.emit()

3 changes: 3 additions & 0 deletions metadata.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ series:
provides:
client:
interface: nats
nrpe-external-master:
interface: nrpe-external-master
scope: container
requires:
ca-client:
interface: tls-certificates
Expand Down
21 changes: 18 additions & 3 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
ModelError,
BlockedStatus,
)
from nrpe.client import NRPEClient
from interfaces import NatsCluster, NatsClient, CAClient

from jinja2 import Environment, FileSystemLoader
Expand Down Expand Up @@ -74,6 +75,9 @@ def __init__(self, framework, key):
self.framework.observe(self.ca_client.on.tls_config_ready, self)
self.framework.observe(self.ca_client.on.ca_available, self)

self.nrpe_client = NRPEClient(self, 'nrpe-external-master')
self.framework.observe(self.nrpe_client.on.nrpe_available, self)

def on_install(self, event):
try:
core_res = self.model.resources.fetch('core')
Expand Down Expand Up @@ -111,10 +115,10 @@ def handle_tls_config(self):
load_pem_private_key(tls_key, backend=default_backend())
tls_cert = self.model.config['tls-cert']
if tls_cert:
load_pem_x509_certificate(tls_cert, bacend=default_backend())
load_pem_x509_certificate(tls_cert, backend=default_backend())
tls_ca_cert = self.model.config['tls-ca-cert']
if tls_ca_cert:
load_pem_x509_certificate(tls_ca_cert, default_backend())
load_pem_x509_certificate(tls_ca_cert, backend=default_backend())

self.state.use_tls = tls_key and tls_cert
self.state.use_tls_ca = bool(tls_ca_cert)
Expand All @@ -129,6 +133,9 @@ def handle_tls_config(self):
self.TLS_CA_CERT_PATH.write_text(tls_ca_cert)
self.client.set_tls_ca(tls_ca_cert)

def on_nrpe_available(self, event):
    """Handle the NRPE client's nrpe_available event by reconfiguring NATS."""
    self.reconfigure_nats()

def on_ca_available(self, event):
    """Handle the CA client's ca_available event by reconfiguring NATS."""
    self.reconfigure_nats()

Expand Down Expand Up @@ -206,14 +213,22 @@ def reconfigure_nats(self):
})
self.client.set_tls_ca(
self.ca_client.ca_certificate.public_bytes(encoding=serialization.Encoding.PEM).decode('utf-8'))

if self.nrpe_client.is_available:
self.nrpe_client.add_check(command=[
'/usr/lib/nagios/plugins/check_tcp',
'-H', str(self.client.listen_address),
'-p', str(self.model.config['client-port'])
], name='check_tcp')
self.nrpe_client.commit()

tenv = Environment(loader=FileSystemLoader('templates'))
template = tenv.get_template('nats.cfg.j2')
rendered_content = template.render(ctxt)
content_hash = self.generate_content_hash(rendered_content)
old_hash = self.state.nats_config_hash
if old_hash != content_hash:
logging.info(f'Config has changed - re-rendering a template to {self.NATS_SERVER_CONFIG_PATH}')
logger.info('')
self.state.nats_config_hash = content_hash
self.NATS_SERVER_CONFIG_PATH.write_text(rendered_content)
if self.state.is_started:
Expand Down