diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index b92c05511a4..b2f1bcae629 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3680,7 +3680,8 @@ def run_on_head( @timeline.event def _check_existing_cluster( - self, task: task_lib.Task, to_provision: resources_lib.Resources, + self, task: task_lib.Task, + to_provision: Optional[resources_lib.Resources], cluster_name: str) -> RetryingVmProvisioner.ToProvisionConfig: """Checks if the cluster exists and returns the provision config. @@ -3690,6 +3691,8 @@ def _check_existing_cluster( exceptions.InvalidClusterNameError: If the cluster name is invalid. # TODO(zhwu): complete the list of exceptions. """ + handle_before_refresh = global_user_state.get_handle_from_cluster_name( + cluster_name) prev_cluster_status, handle = ( backend_utils.refresh_cluster_status_handle( cluster_name, acquire_per_cluster_status_lock=False)) @@ -3713,6 +3716,22 @@ def _check_existing_cluster( if resources.cloud is not None else clouds.Cloud) task_cloud.check_cluster_name_is_valid(cluster_name) + if to_provision is None: + logger.info( + f'The cluster {cluster_name!r} was autodowned or manually ' + 'terminated on the cloud console. Using the same resources ' + 'as the previously terminated one to provision a new cluster.') + # The cluster is recently terminated either by autostop or manually + # terminated on the cloud. We should use the previously terminated + # resources to provision the cluster. + assert isinstance( + handle_before_refresh, CloudVmRayResourceHandle), ( + f'Trying to launch cluster {cluster_name!r} recently ' + 'terminated on the cloud, but the handle is not a ' + f'CloudVmRayResourceHandle ({handle_before_refresh}).') + to_provision = handle_before_refresh.launched_resources + self.check_resources_fit_cluster(handle_before_refresh, task) + cloud = to_provision.cloud if isinstance(cloud, clouds.Local): # The field ssh_user is specified in the cluster config file. diff --git a/sky/skylet/providers/oci/query_helper.py b/sky/skylet/providers/oci/query_helper.py index 28aadf90341..29601192e5d 100644 --- a/sky/skylet/providers/oci/query_helper.py +++ b/sky/skylet/providers/oci/query_helper.py @@ -57,7 +57,7 @@ def query_instances_by_tags(cls, tag_filters, region): @classmethod def terminate_instances_by_tags(cls, tag_filters, region) -> int: - logger.info(f"Terminate instance by tags: {tag_filters}") + logger.debug(f"Terminate instance by tags: {tag_filters}") insts = cls.query_instances_by_tags(tag_filters, region) fail_count = 0 for inst in insts: @@ -73,7 +73,7 @@ def terminate_instances_by_tags(cls, tag_filters, region) -> int: traceback.print_exc() if fail_count == 0: - logger.info(f"Instance teardown result: OK") + logger.debug(f"Instance teardown result: OK") else: logger.warn(f"Instance teardown result: {fail_count} failed!")