Skip to content

Commit

Permalink
[Azure] Allow resource group specifiation for Azure instance provisio…
Browse files Browse the repository at this point in the history
…ning (skypilot-org#3764)

* Allow resource group specifiation for Azure instance provisioning

* Add 'use_external_resource_group' under provider config

* nit

* attached resources deletion

* support deployment removal when terminating

* nit

* delete RoleAssignment when terminating

* update ARM config template

* nit

* nit

* delete role assignment with guid

* update role assignment removal logic

* Separate resource group region and VM, attached resources

* nit

* nit

* nit

* nit

* add error handling for deletion

* format

* deployment naming update

* test

* nit

* update deployment constant names

* update open_ports to wait for the nsg creation corresponding to the VM being provisioned

* format

* nit

* format

* update docstring

* add back deleted snippet

* format

* delete nic with retries

* error handle update
  • Loading branch information
landscapepainter authored and AlexCuadron committed Nov 7, 2024
1 parent e2a6a7a commit a761dcb
Show file tree
Hide file tree
Showing 9 changed files with 372 additions and 118 deletions.
3 changes: 3 additions & 0 deletions sky/adaptors/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,9 @@ def get_client(name: str,
from azure.mgmt import authorization
return authorization.AuthorizationManagementClient(
credential, subscription_id)
elif name == 'msi':
from azure.mgmt import msi
return msi.ManagedServiceIdentityClient(credential, subscription_id)
elif name == 'graph':
import msgraph
return msgraph.GraphServiceClient(credential)
Expand Down
11 changes: 10 additions & 1 deletion sky/clouds/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from sky import clouds
from sky import exceptions
from sky import sky_logging
from sky import skypilot_config
from sky.adaptors import azure
from sky.clouds import service_catalog
from sky.clouds.utils import azure_utils
Expand Down Expand Up @@ -353,6 +354,13 @@ def make_deploy_resources_variables(
need_nvidia_driver_extension = (acc_dict is not None and
'A10' in acc_dict)

# Determine resource group for deploying the instance.
resource_group_name = skypilot_config.get_nested(
('azure', 'resource_group_vm'), None)
use_external_resource_group = resource_group_name is not None
if resource_group_name is None:
resource_group_name = f'{cluster_name.name_on_cloud}-{region_name}'

# Setup commands to eliminate the banner and restart sshd.
# This script will modify /etc/ssh/sshd_config and add a bash script
# into .bashrc. The bash script will restart sshd if it has not been
Expand Down Expand Up @@ -409,7 +417,8 @@ def _failover_disk_tier() -> Optional[resources_utils.DiskTier]:
'disk_tier': Azure._get_disk_type(disk_tier),
'cloud_init_setup_commands': cloud_init_setup_commands,
'azure_subscription_id': self.get_project_id(dryrun),
'resource_group': f'{cluster_name.name_on_cloud}-{region_name}',
'resource_group': resource_group_name,
'use_external_resource_group': use_external_resource_group,
}

# Setting disk performance tier for high disk tier.
Expand Down
8 changes: 7 additions & 1 deletion sky/provision/azure/azure-config-template.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@
"description": "Subnet parameters."
}
},
"location": {
"type": "string",
"metadata": {
"description": "Location of where the resources are allocated."
}
},
"nsgName": {
"type": "string",
"metadata": {
Expand All @@ -23,7 +29,7 @@
},
"variables": {
"contributor": "[subscriptionResourceId('Microsoft.Authorization/roleDefinitions', 'b24988ac-6180-42a0-ab88-20f7382dd24c')]",
"location": "[resourceGroup().location]",
"location": "[parameters('location')]",
"msiName": "[concat('sky-', parameters('clusterId'), '-msi')]",
"roleAssignmentName": "[concat('sky-', parameters('clusterId'), '-ra')]",
"nsgName": "[parameters('nsgName')]",
Expand Down
110 changes: 65 additions & 45 deletions sky/provision/azure/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,12 @@
from sky import sky_logging
from sky.adaptors import azure
from sky.provision import common
from sky.provision import constants
from sky.utils import common_utils

logger = sky_logging.init_logger(__name__)

UNIQUE_ID_LEN = 4
_DEPLOYMENT_NAME = 'skypilot-config'
_LEGACY_DEPLOYMENT_NAME = 'ray-config'
_RESOURCE_GROUP_WAIT_FOR_DELETION_TIMEOUT = 480 # 8 minutes
_CLUSTER_ID = '{cluster_name_on_cloud}-{unique_id}'

Expand Down Expand Up @@ -82,46 +81,55 @@ def bootstrap_instances(
in provider_config), 'Provider config must include location field'
params = {'location': provider_config['location']}

assert ('use_external_resource_group'
in provider_config), ('Provider config must include '
'use_external_resource_group field')
use_external_resource_group = provider_config['use_external_resource_group']

if 'tags' in provider_config:
params['tags'] = provider_config['tags']

logger.info(f'Creating/Updating resource group: {resource_group}')
rg_create_or_update = get_azure_sdk_function(
client=resource_client.resource_groups,
function_name='create_or_update')
rg_creation_start = time.time()
retry = 0
while (time.time() - rg_creation_start <
_RESOURCE_GROUP_WAIT_FOR_DELETION_TIMEOUT):
try:
rg_create_or_update(resource_group_name=resource_group,
parameters=params)
break
except azure.exceptions().ResourceExistsError as e:
if 'ResourceGroupBeingDeleted' in str(e):
if retry % 5 == 0:
logger.info(
f'Azure resource group {resource_group} of a recent '
f'terminated cluster {cluster_name_on_cloud} is being '
'deleted. It can only be provisioned after it is fully '
'deleted. Waiting...')
time.sleep(1)
retry += 1
continue
raise
except azure.exceptions().ClientAuthenticationError as e:
# When resource group is user specified, it already exists in certain
# region.
if not use_external_resource_group:
logger.info(f'Creating/Updating resource group: {resource_group}')
rg_create_or_update = get_azure_sdk_function(
client=resource_client.resource_groups,
function_name='create_or_update')
rg_creation_start = time.time()
retry = 0
while (time.time() - rg_creation_start <
_RESOURCE_GROUP_WAIT_FOR_DELETION_TIMEOUT):
try:
rg_create_or_update(resource_group_name=resource_group,
parameters=params)
break
except azure.exceptions().ResourceExistsError as e:
if 'ResourceGroupBeingDeleted' in str(e):
if retry % 5 == 0:
logger.info(
f'Azure resource group {resource_group} of a '
'recent terminated cluster '
f'{cluster_name_on_cloud} is being deleted. It can'
' only be provisioned after it is fully deleted. '
'Waiting...')
time.sleep(1)
retry += 1
continue
raise
except azure.exceptions().ClientAuthenticationError as e:
message = (
'Failed to authenticate with Azure. Please check your '
'Azure credentials. Error: '
f'{common_utils.format_exception(e)}').replace('\n', ' ')
logger.error(message)
raise exceptions.NoClusterLaunchedError(message) from e
else:
message = (
'Failed to authenticate with Azure. Please check your Azure '
f'credentials. Error: {common_utils.format_exception(e)}'
).replace('\n', ' ')
f'Timed out waiting for resource group {resource_group} to be '
'deleted.')
logger.error(message)
raise exceptions.NoClusterLaunchedError(message) from e
else:
message = (
f'Timed out waiting for resource group {resource_group} to be '
'deleted.')
logger.error(message)
raise TimeoutError(message)
raise TimeoutError(message)

# load the template file
current_path = Path(__file__).parent
Expand Down Expand Up @@ -155,6 +163,9 @@ def bootstrap_instances(
'nsgName': {
'value': nsg_name
},
'location': {
'value': params['location']
}
},
}
}
Expand All @@ -164,11 +175,22 @@ def bootstrap_instances(
get_deployment = get_azure_sdk_function(client=resource_client.deployments,
function_name='get')
deployment_exists = False
for deployment_name in [_DEPLOYMENT_NAME, _LEGACY_DEPLOYMENT_NAME]:
if use_external_resource_group:
deployment_name = (
constants.EXTERNAL_RG_BOOTSTRAP_DEPLOYMENT_NAME.format(
cluster_name_on_cloud=cluster_name_on_cloud))
deployment_list = [deployment_name]
else:
deployment_name = constants.DEPLOYMENT_NAME
deployment_list = [
constants.DEPLOYMENT_NAME, constants.LEGACY_DEPLOYMENT_NAME
]

for deploy_name in deployment_list:
try:
deployment = get_deployment(resource_group_name=resource_group,
deployment_name=deployment_name)
logger.info(f'Deployment {deployment_name!r} already exists. '
deployment_name=deploy_name)
logger.info(f'Deployment {deploy_name!r} already exists. '
'Skipping deployment creation.')

outputs = deployment.properties.outputs
Expand All @@ -179,22 +201,20 @@ def bootstrap_instances(
deployment_exists = False

if not deployment_exists:
logger.info(f'Creating/Updating deployment: {_DEPLOYMENT_NAME}')
logger.info(f'Creating/Updating deployment: {deployment_name}')
create_or_update = get_azure_sdk_function(
client=resource_client.deployments,
function_name='create_or_update')
# TODO (skypilot): this takes a long time (> 40 seconds) to run.
outputs = create_or_update(
resource_group_name=resource_group,
deployment_name=_DEPLOYMENT_NAME,
deployment_name=deployment_name,
parameters=parameters,
).result().properties.outputs

nsg_id = outputs['nsg']['value']

# append output resource ids to be used with vm creation
provider_config['msi'] = outputs['msi']['value']
provider_config['nsg'] = nsg_id
provider_config['nsg'] = outputs['nsg']['value']
provider_config['subnet'] = outputs['subnet']['value']

return config
Loading

0 comments on commit a761dcb

Please sign in to comment.