From 9356a5dd26598c4e68bdc6e377868526ee09f87d Mon Sep 17 00:00:00 2001 From: Christopher Cooper Date: Thu, 31 Oct 2024 10:49:49 -0700 Subject: [PATCH 1/3] [ux] add sky jobs launch --fast This flag will make the jobs controller launch use `sky launch --fast`. There are a few known situations where this can cause misbehavior in the jobs controller: - The SkyPilot wheel is outdated (due to changes in the SkyPilot code or a version upgrade). - The user's cloud credentials have changed. In this case the new credentials will not be synced, and if there are new clouds available in `sky check`, the cloud dependencies may not be correctly installed. However, this does speed up `jobs launch` _significantly_, so provide it as a dangerous option. Soon we will add robustness checks to `sky launch --fast` that will fix the above caveats, and we can remove this flag and just enable the behavior by default. --- sky/cli.py | 13 ++++++++++++- sky/jobs/core.py | 6 ++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/sky/cli.py b/sky/cli.py index ce3ad16ecb9..4e8b1f75f48 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3560,6 +3560,15 @@ def jobs(): default=False, required=False, help='Skip confirmation prompt.') +# TODO(cooperc): remove this flag once --fast can robustly detect cluster +# yaml config changes +@click.option('--fast', + default=False, + is_flag=True, + help='[Experimental] Launch the job more quickly, but skip some ' + 'initialization steps. If you update SkyPilot or your local ' + 'cloud credentials, they will not be reflected until you run ' + '`sky jobs launch` at least once without this flag.') @timeline.event @usage_lib.entrypoint def jobs_launch( @@ -3586,6 +3595,7 @@ def jobs_launch( detach_run: bool, retry_until_up: bool, yes: bool, + fast: bool, ): """Launch a managed job from a YAML or a command. 
@@ -3669,7 +3679,8 @@ def jobs_launch( managed_jobs.launch(dag, name, detach_run=detach_run, - retry_until_up=retry_until_up) + retry_until_up=retry_until_up, + fast=fast) @jobs.command('queue', cls=_DocumentedCodeCommand) diff --git a/sky/jobs/core.py b/sky/jobs/core.py index 6c1ac42d192..b43b9c591ab 100644 --- a/sky/jobs/core.py +++ b/sky/jobs/core.py @@ -36,6 +36,7 @@ def launch( stream_logs: bool = True, detach_run: bool = False, retry_until_up: bool = False, + fast: bool = False, ) -> None: # NOTE(dev): Keep the docstring consistent between the Python API and CLI. """Launch a managed job. @@ -47,11 +48,15 @@ def launch( managed job. name: Name of the managed job. detach_run: Whether to detach the run. + fast: Whether to use sky.launch(fast=True) for the jobs controller. If + True, the SkyPilot wheel and the cloud credentials may not be updated + on the jobs controller. Raises: ValueError: cluster does not exist. Or, the entrypoint is not a valid chain dag. sky.exceptions.NotSupportedError: the feature is not supported. + """ entrypoint = task dag_uuid = str(uuid.uuid4().hex[:4]) @@ -138,6 +143,7 @@ def launch( idle_minutes_to_autostop=skylet_constants. CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP, retry_until_up=True, + fast=fast, _disable_controller_check=True) From cb7943227231b87ded9961f7cdb5cb7a271f353f Mon Sep 17 00:00:00 2001 From: Christopher Cooper Date: Thu, 31 Oct 2024 12:46:36 -0700 Subject: [PATCH 2/3] Apply suggestions from code review Co-authored-by: Romil Bhardwaj --- sky/cli.py | 2 +- sky/jobs/core.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 4e8b1f75f48..067f1a5d490 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3565,7 +3565,7 @@ def jobs(): @click.option('--fast', default=False, is_flag=True, - help='[Experimental] Launch the job more quickly, but skip some ' + help='[Experimental] Launch the job faster by skipping controller ' 'initialization steps. 
If you update SkyPilot or your local ' 'cloud credentials, they will not be reflected until you run ' '`sky jobs launch` at least once without this flag.') diff --git a/sky/jobs/core.py b/sky/jobs/core.py index b43b9c591ab..5668e19332a 100644 --- a/sky/jobs/core.py +++ b/sky/jobs/core.py @@ -56,7 +56,6 @@ def launch( ValueError: cluster does not exist. Or, the entrypoint is not a valid chain dag. sky.exceptions.NotSupportedError: the feature is not supported. - """ entrypoint = task dag_uuid = str(uuid.uuid4().hex[:4]) From 22b16617ff49b7089cb35b0bd195e39a886fd3e4 Mon Sep 17 00:00:00 2001 From: Christopher Cooper Date: Thu, 31 Oct 2024 14:04:34 -0700 Subject: [PATCH 3/3] fix lint --- sky/cli.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 067f1a5d490..462e8a5b9de 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3565,10 +3565,10 @@ def jobs(): @click.option('--fast', default=False, is_flag=True, - help='[Experimental] Launch the job faster by skipping controller ' - 'initialization steps. If you update SkyPilot or your local ' - 'cloud credentials, they will not be reflected until you run ' - '`sky jobs launch` at least once without this flag.') + help='[Experimental] Launch the job faster by skipping ' + 'controller initialization steps. If you update SkyPilot or ' + 'your local cloud credentials, they will not be reflected until ' + 'you run `sky jobs launch` at least once without this flag.') @timeline.event @usage_lib.entrypoint def jobs_launch(