Skip to content

Commit

Permalink
prov/efa: differentiate unresponsive receiver errors following rdma-core
Browse files Browse the repository at this point in the history
Add a new vendor error code, EFA_IO_COMP_STATUS_LOCAL_ERROR_UNREACH_REMOTE,
from rdma-core to indicate that the remote is unreachable.
Add a new EFA provider error code, UNESTABLISHED_RECV_UNRESP, to distinguish
the unresponsive-receiver error that occurs when the peer is reachable by the
EFA device but libfabric failed to complete a handshake.
Add unit test for EFA_IO_COMP_STATUS_LOCAL_ERROR_UNREACH_REMOTE.

Signed-off-by: Jessie Yang <jiaxiyan@amazon.com>
  • Loading branch information
jiaxiyan committed Sep 25, 2024
1 parent e3c12ec commit 443a7ef
Show file tree
Hide file tree
Showing 6 changed files with 44 additions and 12 deletions.
12 changes: 8 additions & 4 deletions prov/efa/src/efa_errno.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,9 @@
_(10, REMOTE_ERROR_RNR, Destination resource not ready (no work queue entries posted on receive queue)) \
_(11, REMOTE_ERROR_BAD_LENGTH, Remote scatter-gather list too short) \
_(12, REMOTE_ERROR_BAD_STATUS, Unexpected status returned by responder) \
_(13, LOCAL_ERROR_UNRESP_REMOTE, Unresponsive remote (detected locally)) \
_(14, REMOTE_ERROR_UNKNOWN_PEER, No valid address handle at remote side (required for RDMA operations))
_(13, LOCAL_ERROR_UNRESP_REMOTE, Unresponsive remote (was previously responsive)) \
_(14, REMOTE_ERROR_UNKNOWN_PEER, No valid address handle at remote side (required for RDMA operations)) \
_(15, LOCAL_ERROR_UNREACH_REMOTE, Unreachable remote (never received a response))

/**
* @brief EFA provider proprietary error codes
Expand Down Expand Up @@ -105,7 +106,8 @@
_(4122, SHM_INTERNAL_ERROR, SHM internal error) \
_(4123, WRITE_SHM_CQ_ENTRY, Failure to write CQ entry for SHM operation) \
_(4124, ESTABLISHED_RECV_UNRESP, Unresponsive receiver (connection previously established)) \
_(4125, INVALID_PKT_TYPE_ZCPY_RX, Invalid packet type received when zero copy recv mode is ON)
_(4125, INVALID_PKT_TYPE_ZCPY_RX, Invalid packet type received when zero copy recv mode is ON) \
_(4126, UNESTABLISHED_RECV_UNRESP, Unresponsive receiver (reachable by EFA device but handshake failed))

/** @} */

Expand Down Expand Up @@ -156,13 +158,15 @@ static inline int to_fi_errno(enum efa_errno err) {
case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNSUPPORTED_OP:
case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_ADDRESS:
return FI_EINVAL;
case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE:
case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNREACH_REMOTE:
return FI_EHOSTUNREACH;
case EFA_IO_COMP_STATUS_LOCAL_ERROR_BAD_LENGTH:
case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_LENGTH:
return FI_EMSGSIZE;
case EFA_IO_COMP_STATUS_REMOTE_ERROR_ABORT:
case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE:
case FI_EFA_ERR_ESTABLISHED_RECV_UNRESP:
case FI_EFA_ERR_UNESTABLISHED_RECV_UNRESP:
return FI_ECONNABORTED;
case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_DEST_QPN:
case EFA_IO_COMP_STATUS_REMOTE_ERROR_UNKNOWN_PEER:
Expand Down
16 changes: 11 additions & 5 deletions prov/efa/src/efa_strerror.c
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,10 @@ void efa_show_help(enum efa_errno err) {
help = "This error is detected remotely; "
"typically encountered when the peer process is no longer present";
break;
case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE:
case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNREACH_REMOTE:
help = "This error is detected locally. "
"The connection status is unknown or was never established via "
"handshake. This typically indicates one or more misconfigured "
"The peer is not reachable by the EFA device. "
"This typically indicates one or more misconfigured "
"EC2 instances; most often due to incorrect inbound/outbound "
"security group rules and/or instances placed in different "
"subnets. Refer to the public AWS documentation for EFA for "
Expand All @@ -80,8 +80,14 @@ void efa_show_help(enum efa_errno err) {
case FI_EFA_ERR_ESTABLISHED_RECV_UNRESP:
help = "This error is detected locally. "
"The connection was previously established via handshake, "
"which indicates the error is likely due to the peer process no "
"longer being present.";
"which indicates the error is likely due to a hardware failure "
"on the remote peer, or the peer process no longer being present.";
break;
case FI_EFA_ERR_UNESTABLISHED_RECV_UNRESP:
help = "This error is detected locally. "
"The peer is reachable by the EFA device but libfabric failed "
"to complete a handshake, which indicates the error is likely "
"due to the peer process no longer being present.";
break;
case FI_EFA_ERR_INVALID_PKT_TYPE_ZCPY_RX:
help = "This error is detected locally. "
Expand Down
9 changes: 6 additions & 3 deletions prov/efa/src/rdm/efa_rdm_cq.c
Original file line number Diff line number Diff line change
Expand Up @@ -399,7 +399,9 @@ static void efa_rdm_cq_handle_recv_completion(struct efa_ibv_cq *ibv_cq, struct
*
* @todo Currently, this only checks for unresponsive receiver
* (#EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE) and attempts to promote it to
* #FI_EFA_ERR_ESTABLISHED_RECV_UNRESP. This should be expanded to handle other
* #FI_EFA_ERR_ESTABLISHED_RECV_UNRESP if a handshake was made, or
* #FI_EFA_ERR_UNESTABLISHED_RECV_UNRESP if the handshake failed.
* This should be expanded to handle other
* RDMA Core error codes (#EFA_IO_COMP_STATUSES) for the sake of more accurate
* error reporting
*/
Expand All @@ -418,8 +420,9 @@ static int efa_rdm_cq_get_prov_errno(struct ibv_cq_ex *ibv_cq_ex) {

switch (vendor_err) {
case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE: {
if (peer->flags & EFA_RDM_PEER_HANDSHAKE_RECEIVED)
vendor_err = FI_EFA_ERR_ESTABLISHED_RECV_UNRESP;
vendor_err = (peer->flags & EFA_RDM_PEER_HANDSHAKE_RECEIVED) ?
FI_EFA_ERR_ESTABLISHED_RECV_UNRESP :
FI_EFA_ERR_UNESTABLISHED_RECV_UNRESP;
break;
}
default:
Expand Down
17 changes: 17 additions & 0 deletions prov/efa/test/efa_unit_test_cq.c
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,23 @@ void test_rdm_cq_read_bad_send_status_unresponsive_receiver_missing_peer_host_id
EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE);
}

/**
 * @brief Check fi_cq_read()/fi_cq_readerr() on an RDM CQ when rdma-core reports
 * an unreachable-remote error (#EFA_IO_COMP_STATUS_LOCAL_ERROR_UNREACH_REMOTE)
 * for a send.
 *
 * When a send operation fails, fi_cq_read() should return -FI_EAVAIL to signal
 * that an error is available. The user should then call fi_cq_readerr() to
 * obtain the error CQ entry that contains the error code.
 *
 * The checks themselves are delegated to test_rdm_cq_read_bad_send_status().
 *
 * @param[in] state pointer to the struct efa_resource that is managed by the framework
 */
void test_rdm_cq_read_bad_send_status_unreachable_receiver(struct efa_resource **state)
{
	test_rdm_cq_read_bad_send_status(*state,
					 0x1234567812345678,
					 0x8765432187654321,
					 EFA_IO_COMP_STATUS_LOCAL_ERROR_UNREACH_REMOTE);
}

/**
* @brief test that RDM CQ's fi_cq_read()/fi_cq_readerr() works properly when rdma-core returns
* invalid qpn error for send.
Expand Down
1 change: 1 addition & 0 deletions prov/efa/test/efa_unit_tests.c
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ int main(void)
cmocka_unit_test_setup_teardown(test_rdm_cq_create_error_handling, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_rdm_cq_read_bad_send_status_unresponsive_receiver, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_rdm_cq_read_bad_send_status_unresponsive_receiver_missing_peer_host_id, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_rdm_cq_read_bad_send_status_unreachable_receiver, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_rdm_cq_read_bad_send_status_invalid_qpn, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_rdm_cq_read_bad_send_status_message_too_long, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_bad_recv_status, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
Expand Down
1 change: 1 addition & 0 deletions prov/efa/test/efa_unit_tests.h
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ void test_ibv_cq_ex_read_failed_poll();
void test_rdm_cq_create_error_handling();
void test_rdm_cq_read_bad_send_status_unresponsive_receiver();
void test_rdm_cq_read_bad_send_status_unresponsive_receiver_missing_peer_host_id();
void test_rdm_cq_read_bad_send_status_unreachable_receiver();
void test_rdm_cq_read_bad_send_status_invalid_qpn();
void test_rdm_cq_read_bad_send_status_message_too_long();
void test_ibv_cq_ex_read_bad_recv_status();
Expand Down

0 comments on commit 443a7ef

Please sign in to comment.