Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use sackd for the login nodes #3126

Merged: 2 commits were merged on Oct 12, 2024 (the source and target branch names were not captured in this page extract).
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
# This file is managed by a script. Manual modifications will be overwritten.
"""

login_nodeset = "x-login"


def dict_to_conf(conf, delim=" ") -> str:
Expand Down Expand Up @@ -130,24 +129,6 @@ def get(key, default):
return dict_to_conf(conf_options, delim="\n")


def loginlines() -> str:
nodeset = {
"NodeSet": login_nodeset,
"Feature": login_nodeset,
}
partition = {
"PartitionName": login_nodeset,
"Nodes": login_nodeset,
"State": "UP",
"DefMemPerCPU": 1,
"Hidden": "YES",
"RootOnly": "YES",
}
lines = [
dict_to_conf(nodeset),
dict_to_conf(partition),
]
return "\n".join(lines)


def nodeset_lines(nodeset, lkp: util.Lookup) -> str:
Expand Down Expand Up @@ -254,7 +235,7 @@ def suspend_exc_lines(lkp: util.Lookup) -> Iterable[str]:
for p in lkp.cfg.partitions.values()
if len(p.partition_nodeset_dyn) > 0
]
suspend_exc_parts = {"SuspendExcParts": [login_nodeset, *dyn_parts]}
suspend_exc_parts = {"SuspendExcParts": [*dyn_parts]}

return filter(
None,
Expand All @@ -270,7 +251,6 @@ def make_cloud_conf(lkp: util.Lookup) -> str:
lines = [
FILE_PREAMBLE,
conflines(lkp),
loginlines(),
*(nodeset_lines(n, lkp) for n in lkp.cfg.nodeset.values()),
*(nodeset_dyn_lines(n) for n in lkp.cfg.nodeset_dyn.values()),
*(nodeset_tpu_lines(n, lkp) for n in lkp.cfg.nodeset_tpu.values()),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,7 @@ def setup_sudoers():
content = """
# Allow SlurmUser to manage the slurm daemons
slurm ALL= NOPASSWD: /usr/bin/systemctl restart slurmd.service
slurm ALL= NOPASSWD: /usr/bin/systemctl restart sackd.service
slurm ALL= NOPASSWD: /usr/bin/systemctl restart slurmctld.service
"""
sudoers_file = Path("/etc/sudoers.d/slurm")
Expand Down Expand Up @@ -366,27 +367,25 @@ def setup_login():
slurmctld_host = f"{lookup().control_host}"
if lookup().control_addr:
slurmctld_host = f"{lookup().control_host}({lookup().control_addr})"
slurmd_options = [
sackd_options = [
f'--conf-server="{slurmctld_host}:{lookup().control_host_port}"',
f'--conf="Feature={conf.login_nodeset}"',
"-Z",
]
sysconf = f"""SLURMD_OPTIONS='{" ".join(slurmd_options)}'"""
update_system_config("slurmd", sysconf)
sysconf = f"""SACKD_OPTIONS='{" ".join(sackd_options)}'"""
update_system_config("sackd", sysconf)
install_custom_scripts()

setup_network_storage()
setup_sudoers()
run("systemctl restart munge")
run("systemctl enable slurmd", timeout=30)
run("systemctl restart slurmd", timeout=30)
run("systemctl enable sackd", timeout=30)
run("systemctl restart sackd", timeout=30)
run("systemctl enable --now slurmcmd.timer", timeout=30)

run_custom_scripts()

log.info("Check status of cluster services")
run("systemctl status munge", timeout=30)
run("systemctl status slurmd", timeout=30)
run("systemctl status sackd", timeout=30)

log.info("Done setting up login")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -428,11 +428,16 @@ def reconfigure_slurm():
log.exception("failed to reconfigure slurmctld")
util.run(f"wall '{update_msg}'", timeout=30)
log.debug("Done.")
elif lookup().instance_role_safe in ["compute", "login"]:
elif lookup().instance_role_safe == "compute":
log.info("Restarting slurmd to make changes take effect.")
run("systemctl restart slurmd")
util.run(f"wall '{update_msg}'", timeout=30)
log.debug("Done.")
elif lookup().instance_role_safe == "login":
log.info("Restarting sackd to make changes take effect.")
run("systemctl restart sackd")
util.run(f"wall '{update_msg}'", timeout=30)
log.debug("Done.")


def update_topology(lkp: util.Lookup) -> None:
Expand Down
Loading