Enable torch elastic training (torchrun) #1603

Merged
33 commits merged on May 3, 2023

Changes from 25 commits

Commits (33)
9527795
Add torch elastic task type
Apr 5, 2023
c0616a8
updated
kumare3 Apr 7, 2023
c9edd3d
updated
kumare3 Apr 8, 2023
153e0d6
Don't pass min_nodes, max_nodes but nnodes only
Apr 8, 2023
264b6c6
Add docstrings
Apr 8, 2023
1a68e44
Cleanup test
Apr 8, 2023
4e73fb4
Replace wrong occurrences of mpi in torch-elastic plugin
Apr 8, 2023
da17096
Extend kfpytorch plugin README
Apr 10, 2023
2073b0a
Move elastic task into existing kf-pytorch plugin
Apr 10, 2023
40f8f46
Add Elastic config to plugin's __init__
Apr 10, 2023
90c966e
Set elastic config in pytorchjob proto
Apr 10, 2023
d9567b5
removed unnecessary model files and simplified codebase
kumare3 Apr 11, 2023
fd2dbaf
Configure rdzv endpoint
Apr 11, 2023
e6a74ba
Configure worker count also for elastic training
Apr 11, 2023
007ab27
Fix exception scope and handle non-rank-0 outputs
fg91 Apr 16, 2023
a5b792f
Remove todo item about handling exception scope, now done
fg91 Apr 16, 2023
252e9d9
Add note about c10d backend
fg91 Apr 16, 2023
daa387f
Let user set min and max replicas explicitly and not via nnodes
fg91 Apr 22, 2023
5664aae
Catch torch import error and configure for flytekitplugins-kfpytorch…
fg91 Apr 22, 2023
d45a23e
Add more tests
fg91 Apr 22, 2023
2864d3b
Update plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py
fg91 Apr 22, 2023
b991e90
Lint
fg91 Apr 23, 2023
f0d8f6e
Explicitly add flyteidl version to plugin
kumare3 Apr 24, 2023
16bc6e5
changing import
kumare3 Apr 25, 2023
77fceb0
removed dynamic execute
kumare3 Apr 25, 2023
c96fa30
Revert "removed dynamic execute"
kumare3 Apr 26, 2023
a31fb47
updated ignoreoutputs
kumare3 Apr 26, 2023
da8a94c
requirements rebuilt
kumare3 Apr 26, 2023
28ade30
Revert back to nnodes instead of min_replicas, max_replicas, replicas
fg91 Apr 27, 2023
27fcb29
Don't handle dynamic execution scope
fg91 Apr 27, 2023
00aeb3c
Amend test to new nnodes api
fg91 Apr 27, 2023
312a436
Merge branch 'master' into fabio/feat/torch-elastic-plugin-fix
kumare3 May 3, 2023
a68f8c4
updated types
kumare3 May 3, 2023
3 changes: 3 additions & 0 deletions plugins/flytekit-kf-pytorch/README.md
@@ -2,6 +2,9 @@

This plugin uses the Kubeflow PyTorch Operator and provides an extremely simplified interface for executing distributed training using various PyTorch backends.

This plugin can execute torch elastic training, which is equivalent to running `torchrun`. Elastic training can be executed
in a single pod (without requiring the PyTorch operator; see below) as well as in a distributed multi-node manner.
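For illustration, a minimal usage sketch based on the `Elastic` config added in this PR (the task body and the specific argument values are placeholders, not taken from the diff):

```python
from flytekit import task, workflow
from flytekitplugins.kfpytorch import Elastic


@task(task_config=Elastic(replicas=2, nproc_per_node=2))
def train() -> None:
    # With replicas=1 this runs torchrun-style training inside a single pod;
    # with replicas > 1 it is submitted as a PyTorchJob with an elastic policy.
    ...


@workflow
def wf() -> None:
    train()
```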

To install the plugin, run the following command:

```bash
2 changes: 1 addition & 1 deletion plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/__init__.py
@@ -10,4 +10,4 @@
PyTorch
"""

from .task import PyTorch
from .task import Elastic, PyTorch
23 changes: 0 additions & 23 deletions plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/models.py

This file was deleted.

196 changes: 190 additions & 6 deletions plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py
@@ -2,16 +2,18 @@
This Plugin adds the capability of running distributed pytorch training to Flyte using backend plugins, natively on
Kubernetes. It leverages `Pytorch Job <https://github.com/kubeflow/pytorch-operator>`_ Plugin from kubeflow.
"""
import os
from dataclasses import dataclass
from typing import Any, Callable, Dict
from typing import Any, Callable, Dict, Optional, Union

import cloudpickle
from flyteidl.plugins.pytorch_pb2 import DistributedPyTorchTrainingTask, ElasticConfig
from google.protobuf.json_format import MessageToDict

import flytekit
from flytekit import PythonFunctionTask
from flytekit.configuration import SerializationSettings
from flytekit.extend import TaskPlugins

from .models import PyTorchJob
from flytekit.extend import IgnoreOutputs, TaskPlugins


@dataclass
@@ -29,6 +31,36 @@ class PyTorch(object):
num_workers: int


@dataclass
class Elastic(object):
"""
Configuration for `torch elastic training <https://pytorch.org/docs/stable/elastic/run.html>`_.

Use this to run single- or multi-node distributed pytorch elastic training on k8s.

Single-node elastic training is executed in a k8s pod when `replicas` is set to 1.
Otherwise, multi-node training is executed using a `Pytorch Job <https://github.com/kubeflow/training-operator>`_.

Args:
replicas (int): Number of nodes.
min_replicas (int): Lower limit for the number of replicas to which the training job can scale down.
max_replicas (int): Upper limit for the number of replicas to which the training job can scale up.
Cannot be smaller than min_replicas.
nproc_per_node (Union[int, str]): Number of workers per node. Supported values are [auto, cpu, gpu, int].
start_method (str): Multiprocessing start method to use when creating workers.
monitor_interval (int): Interval, in seconds, to monitor the state of workers.
max_restarts (int): Maximum number of worker group restarts before failing.
"""

replicas: int = 1
kumare3 (Contributor):
can we just get rid of replicas?
and just make num_workers = max_replicas?
is that not right?
is replicas supposed to be desired_replicas?

fg91 (Member Author), Apr 25, 2023:
> can we just get rid of replicas?
> and just make num_workers = max_replicas?

Yes, we can get this behaviour by reverting daa387f.

I brought this up here because I was very unsure whether we should expose pytorch's nnodes to the user or Kubeflow's minReplicas, maxReplicas, replicas.

> is replicas supposed to be desired_replicas?

Kubeflow unfortunately provides zero docs about this. This is how they set defaults for minReplicas and maxReplicas.

They have two elastic examples. In the first one, they specify:

```yaml
kind: PyTorchJob
  ...
spec:
  elasticPolicy:
    minReplicas: 1
    maxReplicas: 2
    ...
  pytorchReplicaSpecs:
    Worker:
      replicas: 2
```

This is basically what you proposed and what we had before daa387f:

> and just make num_workers = max_replicas?

In the second example, however, maxReplicas > replicas:

```yaml
kind: PyTorchJob
  ...
spec:
  elasticPolicy:
    minReplicas: 1
    maxReplicas: 3
    ...
    metrics:
      - type: Resource
        resource:
          name: cpu
          target:
            type: Utilization
            averageUtilization: 80
  pytorchReplicaSpecs:
    Worker:
      replicas: 2
```

They use HPA to scale up nodes based on the configured metrics.

But I now realize that maxReplicas > replicas only makes sense if you use HPA; otherwise there is no way to get more than the initial replicas. Do you agree?

That being said, using metrics/HPA to dynamically scale up nodes appears, from everything I can see in the torch docs, to be a Kubeflow design decision and does not come from torchrun itself. The pytorch docs only talk about membership changes in general since they don't take care of provisioning nodes at all anyway:

> Node departure (scale-down): The agent is notified of the departure, all existing workers are stopped, a new WorkerGroup is formed, and all workers are started with a new RANK and WORLD_SIZE.
>
> Node arrival (scale-up): The new node is admitted to the job, all existing workers are stopped, a new WorkerGroup is formed, and all workers are started with a new RANK and WORLD_SIZE.


My personal opinion:

Being able to continue training if one node dies is a nice feature.

I personally wouldn't want to use e.g. CPU metrics + HPA to add a new distributed training worker: since HPA might scale nodes up and down all the time, the worker groups might end up getting restarted (on every membership change) far more often than necessary. And training code often needs a bit of time for setting things up, loading the model, ... before the actual training starts. So I'm not convinced of this feature...

To summarize:

If, after digging into this again, I now understand correctly, we would have to expose the metrics as well if we want to allow our users to set max_replicas > replicas. I suggest not doing this, at least in version 1, and reverting back to only exposing nnodes.

What do you think, @kumare3?

kumare3 (Contributor):
@fg91 I seem to agree, but I was thinking: instead of nnodes, keep min_replica and max_replica? Why not? Though I will go with whatever you decide.
I actually am not really sure if HPA works with gloo correctly. It's not really easy to change the membership and have it work correctly. Let's reserve this for later and get the basic version out. If you make the change I can +1 tonight and merge. I have been testing on a single node and it seems to work great.

fg91 (Member Author), Apr 26, 2023:
I have a slight preference for nnodes=1 / "1:2" over min_replicas, max_replicas for the following reasons:

  • It doesn't introduce a third naming scheme, in addition to torchrun's nnodes and Kubeflow's replicas, minReplicas, maxReplicas, that neither group of users knows.
  • I think most users will want a fixed number of workers and use torchrun simply because a project like Alpaca or a library like ignite assumes it, or because they want to do distributed training on a single node without the operator. It should be as simple as possible for those users who don't think about min/max, and I feel that nnodes=2 is simpler for them than min_replicas=2, max_replicas=2. (Plus, if they come from pytorch, they already know the syntax.)

If it's ok for you I will change back to nnodes and ping you again for review :)
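To illustrate the two options being discussed, a sketch (the `nnodes` signature is hypothetical at this point in the diff; it only lands with the later commit 28ade30, "Revert back to nnodes instead of min_replicas, max_replicas, replicas"):

```python
from flytekitplugins.kfpytorch import Elastic

# Kubeflow-style bounds, as exposed by the revision of the diff shown on this page:
elastic_cfg = Elastic(replicas=2, min_replicas=2, max_replicas=2, nproc_per_node=4)

# torchrun-style node range, roughly `torchrun --nnodes=1:2`; hypothetical here,
# adopted later in the PR:
# elastic_cfg = Elastic(nnodes="1:2", nproc_per_node=4)
```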

min_replicas: Optional[int] = None
max_replicas: Optional[int] = None
nproc_per_node: Union[int, str] = "auto"
start_method: str = "spawn"
monitor_interval: int = 5
max_restarts: int = 0


class PyTorchFunctionTask(PythonFunctionTask[PyTorch]):
"""
Plugin that submits a PyTorchJob (see https://github.com/kubeflow/pytorch-operator)
@@ -46,9 +78,161 @@ def __init__(self, task_config: PyTorch, task_function: Callable, **kwargs):
)

def get_custom(self, settings: SerializationSettings) -> Dict[str, Any]:
job = PyTorchJob(workers_count=self.task_config.num_workers)
return MessageToDict(job.to_flyte_idl())
job = DistributedPyTorchTrainingTask(workers=self.task_config.num_workers)
return MessageToDict(job)


# Register the Pytorch Plugin into the flytekit core plugin system
TaskPlugins.register_pythontask_plugin(PyTorch, PyTorchFunctionTask)


def spawn_helper(fn: bytes, kwargs) -> Any:
"""Help to spawn worker processes.

The purpose of this function is to 1) be pickleable so that it can be used with
the multiprocessing start method `spawn` and 2) to call a cloudpickle-serialized
function passed to it. This function itself doesn't have to be pickleable. Without
such a helper task functions, which are not pickleable, couldn't be used with the
start method `spawn`.

Args:
fn (bytes): Cloudpickle-serialized target function to be executed in the worker process.

Returns:
The return value of the received target function.
"""
fn = cloudpickle.loads(fn)
return_val = fn(**kwargs)
return return_val


class PytorchElasticFunctionTask(PythonFunctionTask[Elastic]):
"""
Plugin for distributed training with torch elastic/torchrun (see
https://pytorch.org/docs/stable/elastic/run.html).
"""

_ELASTIC_TASK_TYPE = "pytorch"
_ELASTIC_TASK_TYPE_STANDALONE = "python-task"

def __init__(self, task_config: Elastic, task_function: Callable, **kwargs):
task_type = self._ELASTIC_TASK_TYPE_STANDALONE if task_config.replicas == 1 else self._ELASTIC_TASK_TYPE

super(PytorchElasticFunctionTask, self).__init__(
task_config=task_config,
task_type=task_type,
task_function=task_function,
**kwargs,
)
self.min_replicas = self.task_config.min_replicas or self.task_config.replicas
self.max_replicas = self.task_config.max_replicas or self.task_config.replicas

if not (self.min_replicas <= self.task_config.replicas <= self.max_replicas):
raise ValueError("Replica config violates `min_replicas <= replicas <= max_replicas`.")

"""
c10d is the backend recommended by torch elastic.
https://pytorch.org/docs/stable/elastic/run.html#note-on-rendezvous-backend

For c10d, no backend server has to be deployed.
https://pytorch.org/docs/stable/elastic/run.html#deployment
Instead, the workers will use the master's address as the rendezvous point.
"""
self.rdzv_backend = "c10d"

def execute(self, **kwargs) -> Any:
"""
This helper method will be invoked to execute the task.


Returns:
The result of rank zero.
"""
try:
from torch.distributed import run
from torch.distributed.launcher.api import LaunchConfig, elastic_launch
except ImportError:
raise ImportError("PyTorch is not installed. Please install `flytekitplugins-kfpytorch['elastic']`.")

if isinstance(self.task_config.nproc_per_node, str):
nproc = run.determine_local_world_size(self.task_config.nproc_per_node)
else:
nproc = self.task_config.nproc_per_node

config = LaunchConfig(
run_id=flytekit.current_context().execution_id.name,
min_nodes=self.min_replicas,
max_nodes=self.max_replicas,
nproc_per_node=nproc,
rdzv_backend=self.rdzv_backend, # rdzv settings
rdzv_endpoint=os.environ.get("PET_RDZV_ENDPOINT", "localhost:0"),
max_restarts=self.task_config.max_restarts,
monitor_interval=self.task_config.monitor_interval,
start_method=self.task_config.start_method,
)

if self.task_config.start_method == "spawn":
"""
We use cloudpickle to serialize the non-pickleable task function.
The torch elastic launcher then launches the spawn_helper function (which is pickleable)
instead of the task function. This helper function, in the child-process, then deserializes
the task function, again with cloudpickle, and executes it.
"""
launcher_target_func = spawn_helper

dumped_target_function = cloudpickle.dumps(self._task_function)
launcher_args = (dumped_target_function, kwargs)
elif self.task_config.start_method == "fork":
"""
The torch elastic launcher doesn't support passing kwargs to the target function,
only args. Flyte only works with kwargs. Thus, we create a closure which already has
the task kwargs bound. We tell the torch elastic launcher to start this function in
the child processes.
"""

def fn_partial():
"""Closure of the task function with kwargs already bound."""
return self._task_function(**kwargs)

launcher_target_func = fn_partial
launcher_args = ()

else:
raise Exception("Bad start method")

out = elastic_launch(
config=config,
entrypoint=launcher_target_func,
)(*launcher_args)

# `out` is a dictionary of rank (not local rank) -> result
# Rank 0 returns the result of the task function
if 0 in out:
return out[0]
else:
raise IgnoreOutputs()

def get_custom(self, settings: SerializationSettings) -> Optional[Dict[str, Any]]:
if self.task_config.replicas == 1:
"""
Single-node torch elastic training is executed in a normal k8s pod so that this
works without the Kubeflow training operator.
"""
return super().get_custom(settings)
else:
elastic_config = ElasticConfig(
rdzv_backend=self.rdzv_backend,
min_replicas=self.min_replicas,
max_replicas=self.max_replicas,
nproc_per_node=self.task_config.nproc_per_node,
max_restarts=self.task_config.max_restarts,
)
job = DistributedPyTorchTrainingTask(
workers=self.task_config.replicas,
elastic_config=elastic_config,
)
return MessageToDict(job)


# Register the PytorchElastic Plugin into the flytekit core plugin system
TaskPlugins.register_pythontask_plugin(Elastic, PytorchElasticFunctionTask)
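As background for the `spawn_helper` pattern above, here is a minimal standalone sketch (not part of the PR) of why the task function is round-tripped through cloudpickle when the `spawn` start method is used: the child process can unpickle the module-level helper with the standard pickle machinery, while the actual target function, which may be a closure or otherwise not picklable, travels as cloudpickle bytes.

```python
import multiprocessing as mp

import cloudpickle


def spawn_helper(fn_bytes: bytes, kwargs: dict):
    """Runs in the child process: deserialize the target function and call it."""
    fn = cloudpickle.loads(fn_bytes)
    return fn(**kwargs)


def main():
    offset = 10
    # A closure like this cannot be pickled by the stdlib pickle used by `spawn`,
    # but cloudpickle serializes it by value.
    target = lambda x: x + offset

    ctx = mp.get_context("spawn")
    with ctx.Pool(1) as pool:
        result = pool.apply(spawn_helper, (cloudpickle.dumps(target), {"x": 5}))
    print(result)  # 15


if __name__ == "__main__":
    main()
```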
5 changes: 4 additions & 1 deletion plugins/flytekit-kf-pytorch/setup.py
@@ -4,7 +4,7 @@

microlib_name = f"flytekitplugins-{PLUGIN_NAME}"

plugin_requires = ["flytekit>=1.3.0b2,<2.0.0"]
plugin_requires = ["cloudpickle", "flytekit>=1.3.0,<2.0.0", "flyteidl>=1.3.19"]

__version__ = "0.0.0+develop"

@@ -17,6 +17,9 @@
namespace_packages=["flytekitplugins"],
packages=[f"flytekitplugins.{PLUGIN_NAME}"],
install_requires=plugin_requires,
extras_require={
"elastic": ["torch>=1.9.0"],
},
license="apache2",
python_requires=">=3.8",
classifiers=[
73 changes: 73 additions & 0 deletions plugins/flytekit-kf-pytorch/tests/test_elastic_task.py
@@ -0,0 +1,73 @@
import os
from dataclasses import dataclass

import pytest
import torch
import torch.distributed as dist
from dataclasses_json import dataclass_json
from flytekitplugins.kfpytorch.task import Elastic

from flytekit import task, workflow


@dataclass_json
@dataclass
class Config:
lr: float = 1e-5
bs: int = 64
name: str = "foo"


def dist_communicate() -> int:
"""Communicate between distributed workers."""
rank = torch.distributed.get_rank()
world_size = dist.get_world_size()
tensor = torch.tensor([5], dtype=torch.int64) + 2 * rank + world_size
dist.all_reduce(tensor, op=dist.ReduceOp.SUM)

return tensor.item()


def train(config: Config) -> tuple[str, Config, torch.nn.Module, int]:
"""Mock training a model using torch-elastic for test purposes."""
dist.init_process_group(backend="gloo")

local_rank = os.environ["LOCAL_RANK"]

out_model = torch.nn.Linear(1000, int(local_rank) + 1)
config.name = "elastic-test"

distributed_result = dist_communicate()

return f"result from local rank {local_rank}", config, out_model, distributed_result


@pytest.mark.parametrize("start_method", ["spawn", "fork"])
def test_end_to_end(start_method: str) -> None:
"""Test that the workflow with elastic task runs end to end."""
world_size = 2

train_task = task(train, task_config=Elastic(replicas=1, nproc_per_node=world_size, start_method=start_method))

@workflow
def wf(config: Config = Config()) -> tuple[str, Config, torch.nn.Module, int]:
return train_task(config=config)

r, cfg, m, distributed_result = wf()
assert "result from local rank 0" in r
assert cfg.name == "elastic-test"
assert m.in_features == 1000
assert m.out_features == 1
"""
The distributed result is calculated by the workers of the elastic train
task by performing a `dist.all_reduce` operation. The correct result can
only be obtained if the distributed process group is initialized correctly.
"""
assert distributed_result == sum([5 + 2 * rank + world_size for rank in range(world_size)])


def test_bad_replica_config() -> None:
"""Test that bad replica config is caught."""

with pytest.raises(ValueError):
task(train, task_config=Elastic(replicas=1, min_replicas=2))
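For reference, a quick standalone check (not part of the PR) of the expected all-reduce value asserted in `test_end_to_end`, with world_size = 2:

```python
world_size = 2
# Each rank contributes 5 + 2 * rank + world_size; dist.all_reduce with SUM adds them up.
expected = sum(5 + 2 * rank + world_size for rank in range(world_size))
print(expected)  # (5 + 0 + 2) + (5 + 2 + 2) = 16
```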