Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Serve] Make controller regions/zones choose from replica resources #4053

Merged
merged 14 commits into from
Oct 24, 2024
39 changes: 29 additions & 10 deletions sky/utils/controller_utils.py
cblmemo marked this conversation as resolved.
Show resolved Hide resolved
cblmemo marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -473,20 +473,35 @@ def get_controller_resources(
# the clouds of the resources if the controller does not exist.
# TODO(tian): Consider respecting the regions/zones specified for the
# resources as well.
requested_clouds: Set['clouds.Cloud'] = set()

requested_clouds_with_region_zone: Dict[str, Dict[Optional[str],
Set[Optional[str]]]] = {}
for resource in task_resources:
# A cloud is an object, so a set cannot tell duplicate clouds apart.
# Here we manually check whether the cloud is already in the set.
if resource.cloud is not None:
if not clouds.cloud_in_iterable(resource.cloud, requested_clouds):
cloud_name = str(resource.cloud) if resource.cloud is not None else None
if cloud_name is not None:
cblmemo marked this conversation as resolved.
Show resolved Hide resolved
if cloud_name not in requested_clouds_with_region_zone:
try:
resource.cloud.check_features_are_supported(
resources.Resources(),
{clouds.CloudImplementationFeatures.HOST_CONTROLLERS})
except exceptions.NotSupportedError:
# Skip the cloud if it does not support hosting controllers.
continue
requested_clouds.add(resource.cloud)
requested_clouds_with_region_zone[cloud_name] = {}
if resource.region is None:
requested_clouds_with_region_zone[cloud_name] = {None: {None}}
elif None not in requested_clouds_with_region_zone[cloud_name]:
if resource.region not in requested_clouds_with_region_zone[
cloud_name]:
requested_clouds_with_region_zone[cloud_name][
resource.region] = set()
if resource.zone is None:
requested_clouds_with_region_zone[cloud_name][
resource.region] = {None}
elif None not in requested_clouds_with_region_zone[cloud_name][
resource.region]:
requested_clouds_with_region_zone[cloud_name][
resource.region].add(resource.zone)
else:
# If any resource.cloud is None, the user may not know which cloud
# is best for the specified resources.
Expand All @@ -496,13 +511,17 @@ def get_controller_resources(
# - cloud: runpod
# accelerators: A40
# In this case, we allow the controller to be launched on any cloud.
requested_clouds.clear()
requested_clouds_with_region_zone.clear()
break
if not requested_clouds:
if not requested_clouds_with_region_zone:
return {controller_resources_to_use}
return {
controller_resources_to_use.copy(cloud=controller_cloud)
for controller_cloud in requested_clouds
controller_resources_to_use.copy(
cloud=clouds.CLOUD_REGISTRY.from_str(cloud_name),
region=region,
zone=zone)
for cloud_name, regions in requested_clouds_with_region_zone.items()
for region, zones in regions.items() for zone in zones
cblmemo marked this conversation as resolved.
Show resolved Hide resolved
}


Expand Down
78 changes: 78 additions & 0 deletions tests/unit_tests/test_controller_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,3 +138,81 @@ def _could_host_controllers(cloud: sky.clouds.Cloud) -> bool:
assert len(controller_resources) == 1
config = list(controller_resources)[0].to_yaml_config()
assert config == default_controller_resources, config

# 4. All resources have clouds, regions, and zones specified.
cblmemo marked this conversation as resolved.
Show resolved Hide resolved
# Return a set with all combinations.
all_cloud_regions_zones = [
sky.Resources(cloud=sky.AWS(), region='us-east-1', zone='us-east-1a'),
sky.Resources(cloud=sky.AWS(), region='ap-south-1', zone='ap-south-1b'),
sky.Resources(cloud=sky.GCP(),
region='us-central1',
zone='us-central1-a'),
sky.Resources(cloud=sky.GCP(),
region='europe-west1',
zone='europe-west1-b')
]
controller_resources = controller_utils.get_controller_resources(
controller=controller_utils.Controllers.from_type(controller_type),
task_resources=all_cloud_regions_zones)
expected_combinations = {('AWS', 'us-east-1', 'us-east-1a'),
('AWS', 'ap-south-1', 'ap-south-1b'),
('GCP', 'us-central1', 'us-central1-a'),
('GCP', 'europe-west1', 'europe-west1-b')}
for r in controller_resources:
config = r.to_yaml_config()
cloud = config.pop('cloud')
region = config.pop('region')
zone = config.pop('zone')
assert (cloud, region, zone) in expected_combinations
expected_combinations.remove((cloud, region, zone))
assert config == default_controller_resources, config
assert not expected_combinations

# 5. Clouds and regions are specified but zones are only partially specified.
controller_resources = controller_utils.get_controller_resources(
controller=controller_utils.Controllers.from_type(controller_type),
task_resources=[
sky.Resources(cloud=sky.AWS(), region='us-west-2'),
cblmemo marked this conversation as resolved.
Show resolved Hide resolved
sky.Resources(cloud=sky.GCP(),
region='us-central1',
zone='us-central1-a')
])
expected_combinations = {('AWS', 'us-west-2', None),
('GCP', 'us-central1', 'us-central1-a')}
for r in controller_resources:
config = r.to_yaml_config()
cloud = config.pop('cloud')
region = config.pop('region')
zone = config.pop('zone', None)
assert (cloud, region, zone) in expected_combinations
expected_combinations.remove((cloud, region, zone))
assert config == default_controller_resources, config
assert not expected_combinations

# 6. Mixed case: Some resources have clouds and regions or zones, others do not.
controller_resources = controller_utils.get_controller_resources(
controller=controller_utils.Controllers.from_type(controller_type),
task_resources=[
sky.Resources(cloud=sky.GCP(), region='europe-west1'),
sky.Resources(cloud=sky.GCP()),
sky.Resources(cloud=sky.AWS(),
region='eu-north-1',
zone='eu-north-1a'),
sky.Resources(cloud=sky.AWS(), region='eu-north-1'),
cblmemo marked this conversation as resolved.
Show resolved Hide resolved
sky.Resources(cloud=sky.Azure()),
])
expected_combinations = {
('AWS', 'eu-north-1', None),
('GCP', None, None),
('Azure', None, None),
}
assert len(controller_resources) == len(expected_combinations)
for r in controller_resources:
config = r.to_yaml_config()
cloud = config.pop('cloud')
region = config.pop('region', None)
zone = config.pop('zone', None)
assert (cloud, region, zone) in expected_combinations
expected_combinations.remove((cloud, region, zone))
assert config == default_controller_resources, config
assert not expected_combinations
Loading