Skip to content

Commit

Permalink
Fix DNS name retrieval on Multi NICs instance
Browse files Browse the repository at this point in the history
Fix the DNS name retrieval on Multi NICs instance, where the PrivateDnsName
is the DNS name of the network interface with NetworkCardIndex 0 and DeviceIndex 0.

The problem was showing up when using `SlurmSettings/Dns/UseEc2Hostnames` equals to `True`.

Signed-off-by: Luca Carrogu <carrogu@amazon.com>
  • Loading branch information
lukeseawalker committed Aug 28, 2023
1 parent 0ad03a1 commit 6adb559
Show file tree
Hide file tree
Showing 5 changed files with 153 additions and 13 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ This file is used to list changes made in each version of the aws-parallelcluste
- Make `aws-parallelcluster-node` daemons handle only ParallelCluster-managed Slurm partitions.

**BUG FIXES**
- Fix an issue that was causing misalignment of compute nodes DNS name on instances with multiple network interfaces,
when using `SlurmSettings/Dns/UseEc2Hostnames` equals to `True`.

3.6.1
------
Expand Down
16 changes: 9 additions & 7 deletions src/common/ec2_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,21 @@
# See the License for the specific language governing permissions and limitations under the License.


def get_private_ip_address(instance_info):
def get_private_ip_address_and_dns_name(instance_info):
"""
Return the PrivateIpAddress of the EC2 instance.
Return the PrivateIpAddress and PrivateDnsName of the EC2 instance.
The PrivateIpAddress is considered to be the one for the
The PrivateIpAddress and PrivateDnsName are considered to be the ones for the
network interface with DeviceIndex = NetworkCardIndex = 0.
:param instance_info: the dictionary returned by a EC2:DescribeInstances call.
:return: the PrivateIpAddress of the instance.
:return: the PrivateIpAddress and PrivateDnsName of the instance.
"""
private_ip = instance_info["PrivateIpAddress"]
private_dns_name = instance_info["PrivateDnsName"]
for network_interface in instance_info["NetworkInterfaces"]:
attachment = network_interface["Attachment"]
if attachment["DeviceIndex"] == 0 and attachment["NetworkCardIndex"] == 0:
private_ip = network_interface["PrivateIpAddress"]
if attachment.get("DeviceIndex", -1) == 0 and attachment.get("NetworkCardIndex", -1) == 0:
private_ip = network_interface.get("PrivateIpAddress", private_ip)
private_dns_name = network_interface.get("PrivateDnsName", private_dns_name)
break
return private_ip
return private_ip, private_dns_name
7 changes: 4 additions & 3 deletions src/slurm_plugin/fleet_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

import boto3
from botocore.exceptions import ClientError
from common.ec2_utils import get_private_ip_address
from common.ec2_utils import get_private_ip_address_and_dns_name
from common.utils import setup_logging_filter

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -50,10 +50,11 @@ def __hash__(self):
@staticmethod
def from_describe_instance_data(instance_info):
try:
private_ip, private_dns_name = get_private_ip_address_and_dns_name(instance_info)
return EC2Instance(
instance_info["InstanceId"],
get_private_ip_address(instance_info),
instance_info["PrivateDnsName"].split(".")[0],
private_ip,
private_dns_name.split(".")[0],
instance_info["LaunchTime"],
)
except KeyError as e:
Expand Down
7 changes: 4 additions & 3 deletions src/slurm_plugin/instance_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
import boto3
from botocore.config import Config
from botocore.exceptions import ClientError
from common.ec2_utils import get_private_ip_address
from common.ec2_utils import get_private_ip_address_and_dns_name
from common.schedulers.slurm_commands import get_nodes_info, update_nodes
from common.utils import grouper, setup_logging_filter
from slurm_plugin.common import ComputeInstanceDescriptor, log_exception, print_with_count
Expand Down Expand Up @@ -436,11 +436,12 @@ def get_cluster_instances(self, include_head_node=False, alive_states_only=True)
instances = []
for instance_info in filtered_iterator:
try:
private_ip, private_dns_name = get_private_ip_address_and_dns_name(instance_info)
instances.append(
EC2Instance(
instance_info["InstanceId"],
get_private_ip_address(instance_info),
instance_info["PrivateDnsName"].split(".")[0],
private_ip,
private_dns_name.split(".")[0],
instance_info["LaunchTime"],
)
)
Expand Down
134 changes: 134 additions & 0 deletions tests/common/test_ec2_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file.
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
# See the License for the specific language governing permissions and limitations under the License.
import pytest
from assertpy import assert_that
from common.ec2_utils import get_private_ip_address_and_dns_name


@pytest.mark.parametrize(
"instance_info, expected_private_ip, expected_private_dns_name",
[
(
{
"InstanceId": "i-12345",
"InstanceType": "c5.xlarge",
"PrivateIpAddress": "ip.1.0.0.1",
"PrivateDnsName": "ip-1-0-0-1",
"NetworkInterfaces": [
{
"Attachment": {
"DeviceIndex": 0,
"NetworkCardIndex": 0,
},
"PrivateIpAddress": "ip.1.0.0.1",
"PrivateDnsName": "ip-1-0-0-1",
},
],
},
"ip.1.0.0.1",
"ip-1-0-0-1",
),
(
{
"InstanceId": "i-12345",
"InstanceType": "c5.xlarge",
"PrivateIpAddress": "ip.1.0.0.1",
"PrivateDnsName": "ip-1-0-0-1",
"NetworkInterfaces": [
{
"Attachment": {
"DeviceIndex": 0,
"NetworkCardIndex": 0,
},
},
],
},
"ip.1.0.0.1",
"ip-1-0-0-1",
),
(
{
"InstanceId": "i-12345",
"InstanceType": "c5.xlarge",
"PrivateIpAddress": "ip.1.0.0.1",
"PrivateDnsName": "ip-1-0-0-1",
"NetworkInterfaces": [
{
"Attachment": {},
},
],
},
"ip.1.0.0.1",
"ip-1-0-0-1",
),
(
{
"InstanceId": "i-12345",
"InstanceType": "c5.xlarge",
"PrivateIpAddress": "ip.1.0.0.1",
"PrivateDnsName": "ip-1-0-0-1",
"NetworkInterfaces": [
{
"Attachment": {
"DeviceIndex": 0,
"NetworkCardIndex": 1,
},
"PrivateIpAddress": "ip.1.0.0.1",
"PrivateDnsName": "ip-1-0-0-1",
},
{
"Attachment": {
"DeviceIndex": 0,
"NetworkCardIndex": 0,
},
"PrivateIpAddress": "ip.1.0.0.2",
"PrivateDnsName": "ip-1-0-0-2",
},
],
},
"ip.1.0.0.2",
"ip-1-0-0-2",
),
(
{
"InstanceId": "i-12345",
"InstanceType": "c5.xlarge",
"PrivateIpAddress": "ip.1.0.0.1",
"PrivateDnsName": "ip-1-0-0-1",
"NetworkInterfaces": [
{
"Attachment": {
"DeviceIndex": 0,
"NetworkCardIndex": 0,
},
"PrivateIpAddress": "ip.1.0.0.1",
"PrivateDnsName": "ip-1-0-0-1",
},
{
"Attachment": {
"DeviceIndex": 0,
"NetworkCardIndex": 1,
},
"PrivateIpAddress": "ip.1.0.0.2",
"PrivateDnsName": "ip-1-0-0-2",
},
],
},
"ip.1.0.0.1",
"ip-1-0-0-1",
),
],
)
def test_get_private_ip_address_and_dns_name(mocker, instance_info, expected_private_ip, expected_private_dns_name):
actual_private_ip, actual_private_dns_name = get_private_ip_address_and_dns_name(instance_info)
assert_that(actual_private_ip).is_equal_to(expected_private_ip)
assert_that(actual_private_dns_name).is_equal_to(expected_private_dns_name)

0 comments on commit 6adb559

Please sign in to comment.