Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[UX] sky launch --fast #4159

Merged
merged 11 commits into from
Oct 31, 2024
2 changes: 1 addition & 1 deletion sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -2737,7 +2737,7 @@ def _provision(
(e.g., cluster name invalid) or a region/zone throwing
resource unavailability.
exceptions.CommandError: any ssh command error.
RuntimeErorr: raised when 'rsync' is not installed.
RuntimeError: raised when 'rsync' is not installed.
# TODO(zhwu): complete the list of exceptions.
"""
# FIXME: ray up for Azure with different cluster_names will overwrite
Expand Down
13 changes: 12 additions & 1 deletion sky/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -549,6 +549,7 @@ def _launch_with_confirm(
retry_until_up: bool = False,
no_setup: bool = False,
clone_disk_from: Optional[str] = None,
fast: bool = False,
):
"""Launch a cluster with a Task."""
if cluster is None:
Expand Down Expand Up @@ -613,6 +614,7 @@ def _launch_with_confirm(
retry_until_up=retry_until_up,
no_setup=no_setup,
clone_disk_from=clone_disk_from,
fast=fast,
)


Expand Down Expand Up @@ -1034,6 +1036,13 @@ def cli():
help=('[Experimental] Clone disk from an existing cluster to launch '
'a new one. This is useful when the new cluster needs to have '
'the same data on the boot disk as an existing cluster.'))
@click.option(
'--fast',
is_flag=True,
default=False,
required=False,
help=('[Experimental] If the cluster is already up and available, skip '
romilbhardwaj marked this conversation as resolved.
Show resolved Hide resolved
'provisioning and setup steps.'))
romilbhardwaj marked this conversation as resolved.
Show resolved Hide resolved
@usage_lib.entrypoint
def launch(
entrypoint: Tuple[str, ...],
Expand Down Expand Up @@ -1065,6 +1074,7 @@ def launch(
yes: bool,
no_setup: bool,
clone_disk_from: Optional[str],
fast: bool,
):
"""Launch a cluster or task.

Expand Down Expand Up @@ -1133,7 +1143,8 @@ def launch(
down=down,
retry_until_up=retry_until_up,
no_setup=no_setup,
clone_disk_from=clone_disk_from)
clone_disk_from=clone_disk_from,
fast=fast)


@cli.command(cls=_DocumentedCodeCommand)
Expand Down
41 changes: 39 additions & 2 deletions sky/execution.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from sky import optimizer
from sky import sky_logging
from sky.backends import backend_utils
from sky.exceptions import ClusterNotUpError
from sky.usage import usage_lib
from sky.utils import admin_policy_utils
from sky.utils import controller_utils
Expand Down Expand Up @@ -215,7 +216,8 @@ def _execute(
'(after all jobs finish).'
f'{colorama.Style.RESET_ALL}')
idle_minutes_to_autostop = 1
stages.remove(Stage.DOWN)
if Stage.DOWN in stages:
stages.remove(Stage.DOWN)
if idle_minutes_to_autostop >= 0:
requested_features.add(
clouds.CloudImplementationFeatures.AUTO_TERMINATE)
Expand Down Expand Up @@ -354,6 +356,7 @@ def launch(
detach_run: bool = False,
no_setup: bool = False,
clone_disk_from: Optional[str] = None,
fast: bool = False,
# Internal only:
# pylint: disable=invalid-name
_is_launched_by_jobs_controller: bool = False,
Expand Down Expand Up @@ -408,6 +411,8 @@ def launch(
clone_disk_from: [Experimental] if set, clone the disk from the
specified cluster. This is useful to migrate the cluster to a
different availability zone or region.
fast: [Experimental] If the cluster is already up and available,
skip provisioning and setup steps.

Example:
.. code-block:: python
Expand Down Expand Up @@ -451,15 +456,47 @@ def launch(
controller_utils.check_cluster_name_not_controller(
cluster_name, operation_str='sky.launch')

handle = None
stages = None
# Check if cluster exists and we are doing fast provisioning
if fast and cluster_name is not None:
maybe_handle = global_user_state.get_handle_from_cluster_name(
cluster_name)
if maybe_handle is not None:
try:
# This will throw if the cluster is not available
backend_utils.check_cluster_available(
cluster_name,
operation='executing tasks',
check_cloud_vm_ray_backend=False,
dryrun=dryrun)
# If the cluster is available, restrict stages
handle = maybe_handle
stages = [
# Stage.CLONE_DISK,
# Stage.PROVISION,
# Stage.OPTIMIZE,
Stage.SYNC_WORKDIR,
# Stage.SYNC_FILE_MOUNTS,
# Stage.SETUP,
# Stage.PRE_EXEC,
Stage.EXEC,
# Stage.DOWN
]
cg505 marked this conversation as resolved.
Show resolved Hide resolved
romilbhardwaj marked this conversation as resolved.
Show resolved Hide resolved
except ClusterNotUpError:
cg505 marked this conversation as resolved.
Show resolved Hide resolved
# Proceed with normal provisioning
pass

return _execute(
entrypoint=entrypoint,
dryrun=dryrun,
down=down,
stream_logs=stream_logs,
handle=None,
handle=handle,
backend=backend,
retry_until_up=retry_until_up,
optimize_target=optimize_target,
stages=stages,
cluster_name=cluster_name,
detach_setup=detach_setup,
detach_run=detach_run,
Expand Down
63 changes: 63 additions & 0 deletions tests/test_smoke.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,69 @@ def test_minimal(generic_cloud: str):
run_one_test(test)


# ---------- Test fast launch ----------
def test_launch_fast(generic_cloud: str):
    """Smoke test for `sky launch --fast` against an already-running cluster.

    The cluster is launched twice with `--fast`. The first launch creates it
    and is checked with the standard launch-output validation. The second
    launch targets the same cluster and is expected to skip provisioning and
    setup while still running the task to completion.

    Args:
        generic_cloud: Cloud name passed to `--cloud` on the first launch and
            used to pick the test timeout.
    """
    cluster = _get_cluster_name()

    # First launch to create the cluster.
    create_cmd = (
        f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {cluster} --cloud {generic_cloud} --fast tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}'
    )

    # Second launch to test fast launch - should not reprovision. The pieces
    # below are joined into a single shell command by literal concatenation.
    relaunch_cmd = (
        f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {cluster} --fast tests/test_yamls/minimal.yaml) && '
        ' echo "$s" && '
        # Validate that cluster was not re-launched.
        '! echo "$s" | grep -A 1 "Launching on" | grep "is up." && '
        # Validate that setup was not re-run.
        '! echo "$s" | grep -A 1 "Running setup on" | grep "running setup" && '
        # Validate that the task ran and finished.
        'echo "$s" | grep -A 1 "task run finish" | grep "Job finished (status: SUCCEEDED)"'
    )

    commands = [
        create_cmd,
        f'sky logs {cluster} 1 --status',
        relaunch_cmd,
        f'sky logs {cluster} 2 --status',
        f'sky status -r {cluster} | grep UP',
    ]
    teardown_cmd = f'sky down -y {cluster}'

    fast_launch_test = Test(
        'test_launch_fast',
        commands,
        teardown_cmd,
        timeout=_get_timeout(generic_cloud),
    )
    run_one_test(fast_launch_test)


# See cloud exclusion explanations in test_autostop
@pytest.mark.no_fluidstack
@pytest.mark.no_lambda_cloud
@pytest.mark.no_ibm
@pytest.mark.no_kubernetes
def test_launch_fast_with_autostop(generic_cloud: str):
    """Smoke test for `sky launch --fast` on a cluster that has autostopped.

    The cluster is launched with `--fast -i 1`, the test waits for autostop
    and checks the cluster reports STOPPED, then launches again with `--fast`.
    The second launch is checked with the full launch-output validation,
    because the cluster is expected to re-launch.

    Args:
        generic_cloud: Cloud name passed to `--cloud` on the first launch and
            used to pick the autostop wait and the test timeout.
    """
    cluster = _get_cluster_name()

    # Azure takes ~ 7m15s (435s) to autostop a VM, so here we use 600 to ensure
    # the VM is stopped.
    if generic_cloud == 'azure':
        stop_wait_seconds = 600
    else:
        stop_wait_seconds = 250

    # First launch to create the cluster with a short autostop.
    create_cmd = (
        f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {cluster} --cloud {generic_cloud} --fast -i 1 tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}'
    )
    # Ensure cluster is stopped.
    assert_stopped_cmd = (
        f's=$(sky status {cluster} --refresh); echo "$s"; echo; echo; echo "$s" | grep {cluster} | grep STOPPED'
    )
    # Launch again. Do full output validation - we expect the cluster to
    # re-launch.
    relaunch_cmd = (
        f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {cluster} --fast -i 1 tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}'
    )

    commands = [
        create_cmd,
        f'sky logs {cluster} 1 --status',
        f'sky status -r {cluster} | grep UP',
        f'sleep {stop_wait_seconds}',
        assert_stopped_cmd,
        relaunch_cmd,
        f'sky logs {cluster} 2 --status',
        f'sky status -r {cluster} | grep UP',
    ]
    teardown_cmd = f'sky down -y {cluster}'

    autostop_test = Test(
        'test_launch_fast_with_autostop',
        commands,
        teardown_cmd,
        # Allow for the autostop wait on top of the regular per-cloud timeout.
        timeout=_get_timeout(generic_cloud) + stop_wait_seconds,
    )
    run_one_test(autostop_test)


# ---------- Test region ----------
@pytest.mark.aws
def test_aws_region():
Expand Down
Loading