diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index b0a064afe7c..67851ec0308 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -2705,7 +2705,7 @@ def _provision( (e.g., cluster name invalid) or a region/zone throwing resource unavailability. exceptions.CommandError: any ssh command error. - RuntimeErorr: raised when 'rsync' is not installed. + RuntimeError: raised when 'rsync' is not installed. # TODO(zhwu): complete the list of exceptions. """ # FIXME: ray up for Azure with different cluster_names will overwrite diff --git a/sky/cli.py b/sky/cli.py index db1befb04a3..ce3ad16ecb9 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -555,6 +555,7 @@ def _launch_with_confirm( retry_until_up: bool = False, no_setup: bool = False, clone_disk_from: Optional[str] = None, + fast: bool = False, ): """Launch a cluster with a Task.""" if cluster is None: @@ -619,6 +620,7 @@ def _launch_with_confirm( retry_until_up=retry_until_up, no_setup=no_setup, clone_disk_from=clone_disk_from, + fast=fast, ) @@ -1040,6 +1042,13 @@ def cli(): help=('[Experimental] Clone disk from an existing cluster to launch ' 'a new one. This is useful when the new cluster needs to have ' 'the same data on the boot disk as an existing cluster.')) +@click.option( + '--fast', + is_flag=True, + default=False, + required=False, + help=('[Experimental] If the cluster is already up and available, skip ' + 'provisioning and setup steps.')) @usage_lib.entrypoint def launch( entrypoint: Tuple[str, ...], @@ -1071,6 +1080,7 @@ def launch( yes: bool, no_setup: bool, clone_disk_from: Optional[str], + fast: bool, ): """Launch a cluster or task. 
@@ -1139,7 +1149,8 @@ def launch( down=down, retry_until_up=retry_until_up, no_setup=no_setup, - clone_disk_from=clone_disk_from) + clone_disk_from=clone_disk_from, + fast=fast) @cli.command(cls=_DocumentedCodeCommand) diff --git a/sky/execution.py b/sky/execution.py index 42a39a90380..8fab5e583fb 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -11,6 +11,7 @@ from sky import admin_policy from sky import backends from sky import clouds +from sky import exceptions from sky import global_user_state from sky import optimizer from sky import sky_logging @@ -216,7 +217,8 @@ def _execute( '(after all jobs finish).' f'{colorama.Style.RESET_ALL}') idle_minutes_to_autostop = 1 - stages.remove(Stage.DOWN) + if Stage.DOWN in stages: + stages.remove(Stage.DOWN) if idle_minutes_to_autostop >= 0: requested_features.add( clouds.CloudImplementationFeatures.AUTO_TERMINATE) @@ -355,6 +357,7 @@ def launch( detach_run: bool = False, no_setup: bool = False, clone_disk_from: Optional[str] = None, + fast: bool = False, # Internal only: # pylint: disable=invalid-name _is_launched_by_jobs_controller: bool = False, @@ -409,6 +412,8 @@ def launch( clone_disk_from: [Experimental] if set, clone the disk from the specified cluster. This is useful to migrate the cluster to a different availability zone or region. + fast: [Experimental] If the cluster is already up and available, + skip provisioning and setup steps. Example: .. 
code-block:: python @@ -452,15 +457,43 @@ def launch( controller_utils.check_cluster_name_not_controller( cluster_name, operation_str='sky.launch') + handle = None + stages = None + # Check if cluster exists and we are doing fast provisioning + if fast and cluster_name is not None: + maybe_handle = global_user_state.get_handle_from_cluster_name( + cluster_name) + if maybe_handle is not None: + try: + # This will throw if the cluster is not available + backend_utils.check_cluster_available( + cluster_name, + operation='executing tasks', + check_cloud_vm_ray_backend=False, + dryrun=dryrun) + handle = maybe_handle + # Only the stages that remain once provisioning and setup are skipped + stages = [ + Stage.SYNC_WORKDIR, + Stage.SYNC_FILE_MOUNTS, + Stage.PRE_EXEC, + Stage.EXEC, + Stage.DOWN, + ] + except exceptions.ClusterNotUpError: + # Proceed with normal provisioning + pass + return _execute( entrypoint=entrypoint, dryrun=dryrun, down=down, stream_logs=stream_logs, - handle=None, + handle=handle, backend=backend, retry_until_up=retry_until_up, optimize_target=optimize_target, + stages=stages, cluster_name=cluster_name, detach_setup=detach_setup, detach_run=detach_run, diff --git a/tests/test_smoke.py b/tests/test_smoke.py index cfe7652e693..61bf0954131 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -360,6 +360,69 @@ def test_minimal(generic_cloud: str): run_one_test(test) +# ---------- Test fast launch ---------- +def test_launch_fast(generic_cloud: str): + name = _get_cluster_name() + + test = Test( + 'test_launch_fast', + [ + # First launch to create the cluster + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} --fast tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', + f'sky logs {name} 1 --status', + + # Second launch to test fast launch - should not reprovision + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --fast tests/test_yamls/minimal.yaml) && ' + ' echo "$s" && ' + # Validate that cluster was not re-launched. + '! 
echo "$s" | grep -A 1 "Launching on" | grep "is up." && ' + # Validate that setup was not re-run. + '! echo "$s" | grep -A 1 "Running setup on" | grep "running setup" && ' + # Validate that the task ran and finished. + 'echo "$s" | grep -A 1 "task run finish" | grep "Job finished (status: SUCCEEDED)"', + f'sky logs {name} 2 --status', + f'sky status -r {name} | grep UP', + ], + f'sky down -y {name}', + timeout=_get_timeout(generic_cloud), + ) + run_one_test(test) + + +# See cloud exclusion explanations in test_autostop +@pytest.mark.no_fluidstack +@pytest.mark.no_lambda_cloud +@pytest.mark.no_ibm +@pytest.mark.no_kubernetes +def test_launch_fast_with_autostop(generic_cloud: str): + name = _get_cluster_name() + # Azure takes ~ 7m15s (435s) to autostop a VM, so here we use 600 to ensure + # the VM is stopped. + autostop_timeout = 600 if generic_cloud == 'azure' else 250 + + test = Test( + 'test_launch_fast_with_autostop', + [ + # First launch to create the cluster with a short autostop + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --cloud {generic_cloud} --fast -i 1 tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', + f'sky logs {name} 1 --status', + f'sky status -r {name} | grep UP', + f'sleep {autostop_timeout}', + + # Ensure cluster is stopped + f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED', + + # Launch again. 
Do full output validation - we expect the cluster to re-launch + f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {name} --fast -i 1 tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}', + f'sky logs {name} 2 --status', + f'sky status -r {name} | grep UP', + ], + f'sky down -y {name}', + timeout=_get_timeout(generic_cloud) + autostop_timeout, + ) + run_one_test(test) + + # ---------- Test region ---------- @pytest.mark.aws def test_aws_region(): @@ -4376,6 +4439,28 @@ def test_core_api_sky_launch_exec(): sky.down(name) +# The sky launch CLI has some additional checks to make sure the cluster is up/ +# restarted. However, the core API doesn't have these; make sure it still works +def test_core_api_sky_launch_fast(generic_cloud: str): + name = _get_cluster_name() + cloud = sky.clouds.CLOUD_REGISTRY.from_str(generic_cloud) + try: + task = sky.Task(run="whoami").set_resources(sky.Resources(cloud=cloud)) + sky.launch(task, + cluster_name=name, + idle_minutes_to_autostop=1, + fast=True) + # Sleep to let the cluster autostop + time.sleep(120) + # Run it again - should work with fast=True + sky.launch(task, + cluster_name=name, + idle_minutes_to_autostop=1, + fast=True) + finally: + sky.down(name) + + # ---------- Testing Storage ---------- class TestStorageWithCredentials: """Storage tests which require credentials and network connection"""