[Core] Launching cluster autodowned or manually terminated #2130

Merged
merged 7 commits on Jun 26, 2023
Changes from 2 commits
19 changes: 18 additions & 1 deletion sky/backends/cloud_vm_ray_backend.py
@@ -3668,7 +3668,8 @@ def run_on_head(

    @timeline.event
    def _check_existing_cluster(
-            self, task: task_lib.Task, to_provision: resources_lib.Resources,
+            self, task: task_lib.Task,
+            to_provision: Optional[resources_lib.Resources],
            cluster_name: str) -> RetryingVmProvisioner.ToProvisionConfig:
        """Checks if the cluster exists and returns the provision config.

@@ -3678,6 +3679,8 @@ def _check_existing_cluster(
            exceptions.InvalidClusterNameError: If the cluster name is invalid.
            # TODO(zhwu): complete the list of exceptions.
        """
+        previous_handle = global_user_state.get_handle_from_cluster_name(
+            cluster_name)
        prev_cluster_status, handle = (
            backend_utils.refresh_cluster_status_handle(
                cluster_name, acquire_per_cluster_status_lock=False))
@@ -3701,6 +3704,20 @@ def _check_existing_cluster(
                      if resources.cloud is not None else clouds.Cloud)
        task_cloud.check_cluster_name_is_valid(cluster_name)

+        if to_provision is None:
+            logger.info(
+                f'The cluster {cluster_name!r} was autodowned or manually '
+                'terminated on the cloud console. Using the original resources '
+                'to provision a new cluster.')
+            # The cluster is recently terminated either by autostop or manually
+            # terminated on the cloud. We should use the original resources to
+            # provision the cluster.
+            assert isinstance(previous_handle,
+                              CloudVmRayResourceHandle), (previous_handle,
+                                                          cluster_name)
Collaborator:

Style nit: The code looks a bit ugly tbh.

Suggested change:
-            assert isinstance(previous_handle,
-                              CloudVmRayResourceHandle), (previous_handle,
-                                                          cluster_name)
+            is_cloud_vm_handle = isinstance(previous_handle,
+                                            CloudVmRayResourceHandle)
+            assert is_cloud_vm_handle, (previous_handle, cluster_name)

Collaborator (Author):

Unfortunately, mypy is not clever enough to narrow the type through this two-line assertion, so I kept the single assert and changed the assertion message to a string to make it look a bit better. PTAL. :)
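(For context, a minimal, self-contained sketch of the mypy behavior discussed above, using a hypothetical `Handle` class rather than SkyPilot's real `CloudVmRayResourceHandle`: mypy narrows an `Optional` variable after a direct `assert isinstance(...)`, but not when the `isinstance` result is first stored in a separate boolean.)

```python
from typing import Optional


class Handle:
    """Hypothetical stand-in for CloudVmRayResourceHandle."""
    launched_resources = 'dummy-resources'


def single_line_assert(previous_handle: Optional[Handle]) -> str:
    # mypy narrows previous_handle from Optional[Handle] to Handle here.
    assert isinstance(previous_handle, Handle), (
        f'No previous handle found for the cluster: {previous_handle}')
    return previous_handle.launched_resources  # OK for mypy


def two_line_assert(previous_handle: Optional[Handle]) -> str:
    # The assert only sees a plain bool, so mypy does not narrow
    # previous_handle; it is still Optional[Handle] on the last line.
    is_cloud_vm_handle = isinstance(previous_handle, Handle)
    assert is_cloud_vm_handle, (previous_handle, 'my-cluster')
    return previous_handle.launched_resources  # mypy reports a possible None
```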

+            to_provision = previous_handle.launched_resources
+            self.check_resources_fit_cluster(previous_handle, task)
+
        cloud = to_provision.cloud
        if isinstance(cloud, clouds.Local):
            # The field ssh_user is specified in the cluster config file.
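To make the intent of the diff above easier to follow, here is a rough, runnable sketch of the fallback flow under stated assumptions: every name below (`Resources`, `Handle`, `previous_handles`, `check_existing_cluster`) is a hypothetical stand-in, not SkyPilot's real API. The previously recorded handle is looked up before any status refresh, and if the caller has no resources to provision (the cluster was autodowned or terminated from the cloud console), the originally launched resources are reused.

```python
from dataclasses import dataclass
from typing import Dict, Optional


@dataclass
class Resources:
    """Hypothetical stand-in for resources_lib.Resources."""
    instance_type: str


@dataclass
class Handle:
    """Hypothetical stand-in for CloudVmRayResourceHandle."""
    launched_resources: Resources


# Plays the role of the global user state: it remembers the handle of
# every cluster that was launched, including ones that are now gone.
previous_handles: Dict[str, Handle] = {
    'my-cluster': Handle(Resources(instance_type='n1-standard-8')),
}


def check_existing_cluster(cluster_name: str,
                           to_provision: Optional[Resources]) -> Resources:
    # Look up the previously recorded handle before refreshing status,
    # since a refresh may clear the record of a terminated cluster.
    previous_handle = previous_handles.get(cluster_name)

    if to_provision is None:
        # The cluster was autodowned or manually terminated on the cloud:
        # fall back to the resources it was originally launched with.
        assert previous_handle is not None, cluster_name
        to_provision = previous_handle.launched_resources
    return to_provision


if __name__ == '__main__':
    # The caller has no cached resources (the cluster is already gone),
    # so the original launched resources are reused for re-provisioning.
    print(check_existing_cluster('my-cluster', to_provision=None))
```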
4 changes: 2 additions & 2 deletions sky/skylet/providers/oci/query_helper.py
@@ -57,7 +57,7 @@ def query_instances_by_tags(cls, tag_filters, region):

    @classmethod
    def terminate_instances_by_tags(cls, tag_filters, region) -> int:
-        logger.info(f"Terminate instance by tags: {tag_filters}")
+        logger.debug(f"Terminate instance by tags: {tag_filters}")
        insts = cls.query_instances_by_tags(tag_filters, region)
        fail_count = 0
        for inst in insts:
@@ -73,7 +73,7 @@ def terminate_instances_by_tags(cls, tag_filters, region) -> int:
                traceback.print_exc()

        if fail_count == 0:
-            logger.info(f"Instance teardown result: OK")
+            logger.debug(f"Instance teardown result: OK")
        else:
            logger.warn(f"Instance teardown result: {fail_count} failed!")