Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use Host Network for Kubernetes NCCL Tests #193

Merged
Merged 1 commit on Sep 11, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number | Diff line number | Diff line change
Expand Up @@ -93,16 +93,19 @@ def _create_job_spec(
"Launcher": {
"replicas": 1,
"template": {
"metadata": {"annotations": {"k8s.v1.cni.cncf.io/networks": "ipoib-cx7-h1@ib0"}},
"spec": {
"containers": [
{
"image": cmd_args["docker_image_url"],
"name": "nccl",
"name": "nccl-launcher",
"env": self._generate_env_list(env_vars),
"command": self._generate_launcher_command(
final_num_nodes, nodes, env_vars, cmd_args, extra_cmd_args
),
"command": ["/bin/bash"],
"args": [
"-c",
self._generate_launcher_command(
final_num_nodes, nodes, env_vars, cmd_args, extra_cmd_args
),
],
"resources": self._prepare_launcher_resources(),
}
],
Expand All @@ -113,13 +116,15 @@ def _create_job_spec(
"Worker": {
"replicas": final_num_nodes,
"template": {
"metadata": {"annotations": {"k8s.v1.cni.cncf.io/networks": "ipoib-cx7-h1@ib0"}},
"spec": {
"hostNetwork": True,
"containers": [
{
"image": cmd_args["docker_image_url"],
"name": "nccl-worker",
"env": self._generate_env_list(env_vars),
"name": "nccl",
"command": ["/bin/bash"],
"args": ["-c", "/usr/sbin/sshd -p 2222; sleep infinity"],
"resources": self._prepare_worker_resources(),
"volumeMounts": [
{"mountPath": "/dev/shm", "name": "dshm"},
Expand Down Expand Up @@ -160,7 +165,7 @@ def _generate_launcher_command(
env_vars: Dict[str, str],
cmd_args: Dict[str, str],
extra_cmd_args: str,
) -> List[str]:
) -> str:
"""
Generate the launcher command for the Kubernetes container.

Expand All @@ -172,7 +177,7 @@ def _generate_launcher_command(
extra_cmd_args (str): Additional command-line arguments for the NCCL test.

Returns:
List[str]: A list representing the launcher command to be executed.
str: The launcher command to be executed.
"""
subtest_name = cmd_args.get("subtest_name")
if subtest_name is None:
Expand Down Expand Up @@ -211,17 +216,13 @@ def _generate_launcher_command(
if extra_cmd_args:
command_parts.append(extra_cmd_args)

return [
"/bin/bash",
"-c",
(
f"mpirun -v --allow-run-as-root -np {final_num_nodes} "
"--hostfile /etc/mpi/hostfile "
"-mca coll ^hcoll -bind-to none "
f"{' '.join([f'-x {key}={value}' for key, value in env_vars.items()])} "
f"{' '.join(command_parts)}"
),
]
return (
f"mpirun -v --allow-run-as-root -np {final_num_nodes} "
"--hostfile /etc/mpi/hostfile "
"-mca coll ^hcoll -mca plm_rsh_args '-p 2222' -bind-to none "
f"{' '.join([f'-x {key}={value}' for key, value in env_vars.items()])} "
f"{' '.join(command_parts)}"
)

def _prepare_launcher_resources(self) -> Dict[str, Dict[str, str]]:
"""
Expand All @@ -231,8 +232,8 @@ def _prepare_launcher_resources(self) -> Dict[str, Dict[str, str]]:
Dict[str, Dict[str, str]]: A dictionary representing the resource requests and limits.
"""
return {
"requests": {"cpu": "2", "memory": "128Mi", "rdma/rdma_ib": "1"},
"limits": {"cpu": "2", "memory": "128Mi", "rdma/rdma_ib": "1"},
"requests": {"cpu": "2", "memory": "8Gi"},
"limits": {"cpu": "2", "memory": "8Gi"},
}

def _prepare_worker_resources(self) -> Dict[str, Dict[str, str]]:
Expand Down