Skip to content

Commit

Permalink
k3s: automated maintenance
Browse files Browse the repository at this point in the history
- Add fc-kubernetes agent command to be used manually and in maintenance
  enter/leave commands.
- Only allow one k3s-agent in maintenance at the same time
- Drain nodes before running maintenance request.

PL-131525
  • Loading branch information
dpausp committed Nov 11, 2023
1 parent 0b121ca commit 10e37b8
Show file tree
Hide file tree
Showing 10 changed files with 687 additions and 51 deletions.
8 changes: 8 additions & 0 deletions nixos/platform/agent.nix
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,14 @@ let
}
compdef _fc_slurm_completion fc-slurm
#compdef fc-kubernetes
_fc_kubernetes_completion() {
eval $(env _TYPER_COMPLETE_ARGS="''${words[1,$CURRENT]}" _FC_KUBERNETES_COMPLETE=complete_zsh fc-kubernetes)
}
compdef _fc_kubernetes_completion fc-kubernetes
'';

agentZshCompletionsPkg = pkgs.runCommand "agent-zshcomplete" {} ''
Expand Down
24 changes: 24 additions & 0 deletions nixos/roles/k3s/agent.nix
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ let
cfg = config.flyingcircus.roles.k3s-agent;
fclib = config.fclib;
server = fclib.findOneService "k3s-server-server";
# Services registered under the name "k3s-agent-agent"
# (lookup semantics are defined by fclib.findServices).
agents = fclib.findServices "k3s-agent-agent";
# Short name of each agent: the part of the service address before the first dot.
agentNames = map (service: head (lib.splitString "." service.address)) agents;
# All agent names except this machine's own host name.
otherAgentNames = filter (m: m != config.networking.hostName) agentNames;
serverAddress = lib.replaceStrings ["gocept.net"] ["fcio.net"] server.address or "";
agentAddress = head fclib.network.srv.v4.addresses;
tokenFile = "/var/lib/k3s/secret_token";
Expand Down Expand Up @@ -46,6 +49,27 @@ in
nfs-utils
];

# Maintenance enter/leave commands for the k3s-agent role.
flyingcircus.agent = {
maintenance.k3s-agent = {
# Move pods to other nodes before starting maintenance activities.
enter =
let
# One "--in-service NAME" argument for every other agent node.
nodeArgs = lib.concatMapStrings (u: " --in-service ${u}") otherAgentNames;
# The script runs with "set -e": if the constraints check fails, the script
# stops there and the drain command is not executed.
script = pkgs.writeScript "k3s-agent-enter-maintenance" ''
set -e
# Check if all other agents are in service, signal "tempfail" otherwise.
fc-maintenance -v constraints --failure-exit-code 75 ${nodeArgs}
fc-kubernetes -v drain --reason fc-agent-maintenance
'';
in "${script}";

# Uncordon node at the end of maintenance request execution.
# The label passed here is the same value that enter passes as the drain reason.
leave = ''
fc-kubernetes -v ready --label-must-match fc-agent-maintenance
'';
};
};

flyingcircus.services.telegraf.inputs = {
kubernetes = [{
# Works without auth on localhost.
Expand Down
50 changes: 31 additions & 19 deletions nixos/roles/k3s/default.nix
Original file line number Diff line number Diff line change
Expand Up @@ -39,27 +39,39 @@
};
};

config = {
config =
let
server = config.flyingcircus.roles.k3s-server.enable;
agent = config.flyingcircus.roles.k3s-agent.enable;
frontend = config.flyingcircus.roles.webgateway.enable;
in
lib.mkMerge [
{
assertions =
[
{
assertion = !(server && agent);
message = "The k3s-agent role must not be enabled together with the k3s-server role.";
}
{
assertion = !(server && frontend);
message = "The k3s-server role must not be enabled together with the webgateway (activates kubernetes frontend) role.";
}
{
assertion = !(agent && frontend);
message = "The k3s-agent role must not be enabled together with the webgateway (activates kubernetes frontend) role.";
}
];

assertions =
let server = config.flyingcircus.roles.k3s-server.enable;
agent = config.flyingcircus.roles.k3s-agent.enable;
frontend = config.flyingcircus.roles.webgateway.enable;
in
[
{
assertion = !(server && agent);
message = "The k3s-agent role must not be enabled together with the k3s-server role.";
}
{
assertion = !(server && frontend);
message = "The k3s-server role must not be enabled together with the webgateway (activates kubernetes frontend) role.";
}
}

(lib.mkIf (server || agent) {
flyingcircus.passwordlessSudoRules = [
{
assertion = !(agent && frontend);
message = "The k3s-agent role must not be enabled together with the webgateway (activates kubernetes frontend) role.";
commands = [ "${pkgs.fc.agent}/bin/fc-kubernetes" ];
groups = [ "admins" "sudo-srv" ];
}
];
};

})
];
}
138 changes: 138 additions & 0 deletions pkgs/fc/agent/fc/manage/kubernetes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
import os
import socket
from pathlib import Path
from typing import NamedTuple, Optional

import fc.util.kubernetes
import structlog
from fc.maintenance.state import EXIT_TEMPFAIL
from fc.util.directory import directory_connection
from fc.util.logging import init_logging
from fc.util.typer_utils import FCTyperApp
from rich import print
from typer import Exit, Option, Typer


class Context(NamedTuple):
    """Global command line options shared by all fc-kubernetes subcommands."""

    # Directory for log files (value of --logdir).
    logdir: Path
    # Whether debug output was requested (value of --verbose / -v).
    verbose: bool
    # Location of the enc.json file (value of --enc-path).
    enc_path: Path


# Command line application object; the callback and commands below register on it.
app = FCTyperApp("fc-kubernetes")
# Declared here, assigned by the fc_kubernetes callback, read by ready_all.
context: Context


@app.callback(no_args_is_help=True)
def fc_kubernetes(
    verbose: bool = Option(
        False,
        "--verbose",
        "-v",
        help="Show debug messages and code locations.",
    ),
    logdir: Path = Option(
        "/var/log",
        help="Directory for log files, expects a fc-agent subdirectory there.",
        exists=True,
        file_okay=False,
        writable=True,
    ),
    enc_path: Path = Option(
        "/etc/nixos/enc.json",
        help="Path to enc.json",
        dir_okay=False,
    ),
):
    # Top-level callback: remembers the global options in the module-level
    # `context` and sets up logging under the "fc-kubernetes" syslog identifier.
    global context

    context = Context(logdir, verbose, enc_path)
    init_logging(verbose, logdir, syslog_identifier="fc-kubernetes")


@app.command(help="Drain this node and wait for completion")
def drain(
    timeout: int = Option(
        300, help="Timeout in seconds passed to kubectl drain."
    ),
    reason: str = Option(
        "fc-kubernetes-drain",
        help=(
            "Set a node label before draining. Labels can only contain "
            "alphanumeric characters and '-', '_' or '.', and must start and "
            "end with an alphanumeric character."
        ),
    ),
    strict_state_check: Optional[bool] = False,
):
    # The node to drain is always the local machine, identified by its host name.
    node_name = socket.gethostname()
    logger = structlog.get_logger()
    try:
        fc.util.kubernetes.drain(
            logger, node_name, timeout, reason, strict_state_check
        )
    except fc.util.kubernetes.NodeDrainTimeout:
        # A drain timeout is reported to the caller as exit code EXIT_TEMPFAIL.
        raise Exit(EXIT_TEMPFAIL)


# Help text added so that this command is described in `fc-kubernetes --help`
# like `drain` and `all-nodes ready`; it was registered without any description.
@app.command(help="Mark this node as ready")
def ready(
    strict_state_check: Optional[bool] = False,
    label_must_match: Optional[str] = Option(
        default=None,
        help="Only set nodes to ready which match a given label.",
    ),
):
    # Uncordons the local machine only; the node name is the local host name.
    log = structlog.get_logger()
    hostname = socket.gethostname()
    fc.util.kubernetes.uncordon(
        log, hostname, strict_state_check, label_must_match
    )


# Command group for cluster-wide operations, mounted on the main
# application under the name "all-nodes".
all_nodes_app = Typer(
    help="Commands that affect all nodes in the cluster",
    no_args_is_help=True,
    pretty_exceptions_show_locals=False,
)
app.add_typer(all_nodes_app, name="all-nodes")


@all_nodes_app.command(name="ready", help="Mark nodes as ready")
def ready_all(
    strict_state_check: Optional[bool] = False,
    label_must_match: Optional[str] = Option(
        None,
        help="Only set nodes to ready which match a given label.",
    ),
    skip_nodes_in_maintenance: Optional[bool] = Option(
        True,
        help="Check maintenance state of nodes and skip when not in service.",
    ),
):
    # Uncordons every agent node in turn, sharing one directory connection
    # (opened from the enc.json path given on the command line) across all calls.
    logger = structlog.get_logger()
    agent_nodes = fc.util.kubernetes.get_all_agent_node_names()
    with directory_connection(context.enc_path) as directory:
        for agent_node in agent_nodes:
            fc.util.kubernetes.uncordon(
                logger,
                agent_node,
                strict_state_check,
                label_must_match,
                skip_nodes_in_maintenance,
                directory,
            )


# Run the command line application when this module is executed as a script.
if __name__ == "__main__":
    app()
Loading

0 comments on commit 10e37b8

Please sign in to comment.