From 6adb5597026de103e1afe969012f4a0ada8a27b5 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Sat, 26 Aug 2023 18:49:31 +0200 Subject: [PATCH] Fix DNS name retrieval on Multi NICs instance Fix the DNS name retrieval on Multi NICs instance, where the PrivateDnsName is the DNS name of the network interface with NetworkCardIndex 0 and DeviceIndex 0. The problem was showing up when using `SlurmSettings/Dns/UseEc2Hostnames` equals to `True`. Signed-off-by: Luca Carrogu --- CHANGELOG.md | 2 + src/common/ec2_utils.py | 16 ++-- src/slurm_plugin/fleet_manager.py | 7 +- src/slurm_plugin/instance_manager.py | 7 +- tests/common/test_ec2_utils.py | 134 +++++++++++++++++++++++++++ 5 files changed, 153 insertions(+), 13 deletions(-) create mode 100644 tests/common/test_ec2_utils.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 249c44bde..2b60e8e86 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,8 @@ This file is used to list changes made in each version of the aws-parallelcluste - Make `aws-parallelcluster-node` daemons handle only ParallelCluster-managed Slurm partitions. **BUG FIXES** +- Fix an issue that was causing misalignment of compute nodes DNS name on instances with multiple network interfaces, + when using `SlurmSettings/Dns/UseEc2Hostnames` equals to `True`. 3.6.1 ------ diff --git a/src/common/ec2_utils.py b/src/common/ec2_utils.py index 9c21a48b1..f443018ab 100644 --- a/src/common/ec2_utils.py +++ b/src/common/ec2_utils.py @@ -11,19 +11,21 @@ # See the License for the specific language governing permissions and limitations under the License. -def get_private_ip_address(instance_info): +def get_private_ip_address_and_dns_name(instance_info): """ - Return the PrivateIpAddress of the EC2 instance. + Return the PrivateIpAddress and PrivateDnsName of the EC2 instance. - The PrivateIpAddress is considered to be the one for the + The PrivateIpAddress and PrivateDnsName are considered to be the ones for the network interface with DeviceIndex = NetworkCardIndex = 0. :param instance_info: the dictionary returned by a EC2:DescribeInstances call. - :return: the PrivateIpAddress of the instance. + :return: the PrivateIpAddress and PrivateDnsName of the instance. """ private_ip = instance_info["PrivateIpAddress"] + private_dns_name = instance_info["PrivateDnsName"] for network_interface in instance_info["NetworkInterfaces"]: attachment = network_interface["Attachment"] - if attachment["DeviceIndex"] == 0 and attachment["NetworkCardIndex"] == 0: - private_ip = network_interface["PrivateIpAddress"] + if attachment.get("DeviceIndex", -1) == 0 and attachment.get("NetworkCardIndex", -1) == 0: + private_ip = network_interface.get("PrivateIpAddress", private_ip) + private_dns_name = network_interface.get("PrivateDnsName", private_dns_name) break - return private_ip + return private_ip, private_dns_name diff --git a/src/slurm_plugin/fleet_manager.py b/src/slurm_plugin/fleet_manager.py index 2d879336b..4d99db370 100644 --- a/src/slurm_plugin/fleet_manager.py +++ b/src/slurm_plugin/fleet_manager.py @@ -16,7 +16,7 @@ import boto3 from botocore.exceptions import ClientError -from common.ec2_utils import get_private_ip_address +from common.ec2_utils import get_private_ip_address_and_dns_name from common.utils import setup_logging_filter logger = logging.getLogger(__name__) @@ -50,10 +50,11 @@ def __hash__(self): @staticmethod def from_describe_instance_data(instance_info): try: + private_ip, private_dns_name = get_private_ip_address_and_dns_name(instance_info) return EC2Instance( instance_info["InstanceId"], - get_private_ip_address(instance_info), - instance_info["PrivateDnsName"].split(".")[0], + private_ip, + private_dns_name.split(".")[0], instance_info["LaunchTime"], ) except KeyError as e: diff --git a/src/slurm_plugin/instance_manager.py b/src/slurm_plugin/instance_manager.py index 024c97d4e..56b8c2021 100644 --- a/src/slurm_plugin/instance_manager.py +++ b/src/slurm_plugin/instance_manager.py @@ -23,7 +23,7 @@ import boto3 from botocore.config import Config from botocore.exceptions import ClientError -from common.ec2_utils import get_private_ip_address +from common.ec2_utils import get_private_ip_address_and_dns_name from common.schedulers.slurm_commands import get_nodes_info, update_nodes from common.utils import grouper, setup_logging_filter from slurm_plugin.common import ComputeInstanceDescriptor, log_exception, print_with_count @@ -436,11 +436,12 @@ def get_cluster_instances(self, include_head_node=False, alive_states_only=True) instances = [] for instance_info in filtered_iterator: try: + private_ip, private_dns_name = get_private_ip_address_and_dns_name(instance_info) instances.append( EC2Instance( instance_info["InstanceId"], - get_private_ip_address(instance_info), - instance_info["PrivateDnsName"].split(".")[0], + private_ip, + private_dns_name.split(".")[0], instance_info["LaunchTime"], ) ) diff --git a/tests/common/test_ec2_utils.py b/tests/common/test_ec2_utils.py new file mode 100644 index 000000000..fd9a27d97 --- /dev/null +++ b/tests/common/test_ec2_utils.py @@ -0,0 +1,134 @@ +# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. +import pytest +from assertpy import assert_that +from common.ec2_utils import get_private_ip_address_and_dns_name + + +@pytest.mark.parametrize( + "instance_info, expected_private_ip, expected_private_dns_name", + [ + ( + { + "InstanceId": "i-12345", + "InstanceType": "c5.xlarge", + "PrivateIpAddress": "ip.1.0.0.1", + "PrivateDnsName": "ip-1-0-0-1", + "NetworkInterfaces": [ + { + "Attachment": { + "DeviceIndex": 0, + "NetworkCardIndex": 0, + }, + "PrivateIpAddress": "ip.1.0.0.1", + "PrivateDnsName": "ip-1-0-0-1", + }, + ], + }, + "ip.1.0.0.1", + "ip-1-0-0-1", + ), + ( + { + "InstanceId": "i-12345", + "InstanceType": "c5.xlarge", + "PrivateIpAddress": "ip.1.0.0.1", + "PrivateDnsName": "ip-1-0-0-1", + "NetworkInterfaces": [ + { + "Attachment": { + "DeviceIndex": 0, + "NetworkCardIndex": 0, + }, + }, + ], + }, + "ip.1.0.0.1", + "ip-1-0-0-1", + ), + ( + { + "InstanceId": "i-12345", + "InstanceType": "c5.xlarge", + "PrivateIpAddress": "ip.1.0.0.1", + "PrivateDnsName": "ip-1-0-0-1", + "NetworkInterfaces": [ + { + "Attachment": {}, + }, + ], + }, + "ip.1.0.0.1", + "ip-1-0-0-1", + ), + ( + { + "InstanceId": "i-12345", + "InstanceType": "c5.xlarge", + "PrivateIpAddress": "ip.1.0.0.1", + "PrivateDnsName": "ip-1-0-0-1", + "NetworkInterfaces": [ + { + "Attachment": { + "DeviceIndex": 0, + "NetworkCardIndex": 1, + }, + "PrivateIpAddress": "ip.1.0.0.1", + "PrivateDnsName": "ip-1-0-0-1", + }, + { + "Attachment": { + "DeviceIndex": 0, + "NetworkCardIndex": 0, + }, + "PrivateIpAddress": "ip.1.0.0.2", + "PrivateDnsName": "ip-1-0-0-2", + }, + ], + }, + "ip.1.0.0.2", + "ip-1-0-0-2", + ), + ( + { + "InstanceId": "i-12345", + "InstanceType": "c5.xlarge", + "PrivateIpAddress": "ip.1.0.0.1", + "PrivateDnsName": "ip-1-0-0-1", + "NetworkInterfaces": [ + { + "Attachment": { + "DeviceIndex": 0, + "NetworkCardIndex": 0, + }, + "PrivateIpAddress": "ip.1.0.0.1", + "PrivateDnsName": "ip-1-0-0-1", + }, + { + "Attachment": { + "DeviceIndex": 0, + "NetworkCardIndex": 1, + }, + "PrivateIpAddress": "ip.1.0.0.2", + "PrivateDnsName": "ip-1-0-0-2", + }, + ], + }, + "ip.1.0.0.1", + "ip-1-0-0-1", + ), + ], +) +def test_get_private_ip_address_and_dns_name(mocker, instance_info, expected_private_ip, expected_private_dns_name): + actual_private_ip, actual_private_dns_name = get_private_ip_address_and_dns_name(instance_info) + assert_that(actual_private_ip).is_equal_to(expected_private_ip) + assert_that(actual_private_dns_name).is_equal_to(expected_private_dns_name)