Skip to content

Commit

Permalink
[LibOS,PAL] Support EINPROGRESS on non-blocking sockets connect
Browse files Browse the repository at this point in the history
Previously, Gramine transformed `connect()` of non-blocking sockets into
a blocking operation and thus never returned -EINPROGRESS. This led to
the connect operation being very slow (waiting for a host timeout) if a
remote peer is unresponsive.

This commit fixes this and adds a LibOS regression test; unfortunately
the test must be run manually with special network settings to properly
test the EINPROGRESS behavior.

Signed-off-by: Dmitrii Kuvaiskii <dmitrii.kuvaiskii@intel.com>
  • Loading branch information
Dmitrii Kuvaiskii committed Nov 7, 2023
1 parent 24a581c commit 144916b
Show file tree
Hide file tree
Showing 20 changed files with 268 additions and 89 deletions.
1 change: 1 addition & 0 deletions common/include/pal_error.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ typedef enum _pal_error_t {
PAL_ERROR_NOTSERVER,
PAL_ERROR_NOTCONNECTION,
PAL_ERROR_CONNFAILED,
PAL_ERROR_INPROGRESS,
PAL_ERROR_ADDRNOTEXIST,
PAL_ERROR_AFNOSUPPORT,
PAL_ERROR_CONNFAILED_PIPE,
Expand Down
1 change: 1 addition & 0 deletions common/src/pal_error.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ static const char* g_pal_error_list[] = {
[PAL_ERROR_NOTSERVER] = "Not a server (PAL_ERROR_NOTSERVER)",
[PAL_ERROR_NOTCONNECTION] = "Not a connection (PAL_ERROR_NOTCONNECTION)",
[PAL_ERROR_CONNFAILED] = "Connection failed (PAL_ERROR_CONNFAILED)",
[PAL_ERROR_INPROGRESS] = "Operation now in progress (PAL_ERROR_INPROGRESS)",
[PAL_ERROR_ADDRNOTEXIST] = "Resource address does not exist (PAL_ERROR_ADDRNOTEXIST)",
[PAL_ERROR_AFNOSUPPORT] = "Address family not supported by protocol (PAL_ERROR_AFNOSUPPORT)",
[PAL_ERROR_CONNFAILED_PIPE] = "Broken pipe (PAL_ERROR_CONNFAILED_PIPE)",
Expand Down
1 change: 1 addition & 0 deletions libos/include/libos_handle.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ struct libos_pipe_handle {
enum libos_sock_state {
SOCK_NEW,
SOCK_BOUND,
SOCK_CONNECTING,
SOCK_CONNECTED,
SOCK_LISTENING,
};
Expand Down
1 change: 1 addition & 0 deletions libos/src/libos_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ static unsigned pal_errno_to_unix_errno_table[PAL_ERROR_NATIVE_COUNT + 1] = {
[PAL_ERROR_NOTSERVER] = EINVAL,
[PAL_ERROR_NOTCONNECTION] = ENOTCONN,
[PAL_ERROR_CONNFAILED] = ECONNRESET,
[PAL_ERROR_INPROGRESS] = EINPROGRESS,
[PAL_ERROR_ADDRNOTEXIST] = EADDRNOTAVAIL,
[PAL_ERROR_AFNOSUPPORT] = EAFNOSUPPORT,
[PAL_ERROR_CONNFAILED_PIPE] = EPIPE,
Expand Down
16 changes: 12 additions & 4 deletions libos/src/net/ip.c
Original file line number Diff line number Diff line change
Expand Up @@ -221,21 +221,29 @@ static int connect(struct libos_handle* handle, void* addr, size_t addrlen) {
linux_to_pal_sockaddr(addr, &pal_remote_addr);
struct pal_socket_addr pal_local_addr;

/* XXX: this connect is always blocking (regardless of actual setting of nonblockingness on
* `sock->pal_handle`. See also the comment in tcp connect implementation in Linux PAL. */
ret = PalSocketConnect(sock->pal_handle, &pal_remote_addr, &pal_local_addr);
if (ret < 0) {
return ret == -PAL_ERROR_CONNFAILED ? -ECONNREFUSED : pal_to_unix_errno(ret);
if (ret == -PAL_ERROR_CONNFAILED) {
/* can't use pal_to_unix_errno() because it translates to ECONNRESET */
return -ECONNREFUSED;
} else if (ret == -PAL_ERROR_INPROGRESS) {
assert(handle->flags & O_NONBLOCK);
ret = pal_to_unix_errno(ret);
goto out;
}
return pal_to_unix_errno(ret);
}

ret = 0;
out:
memcpy(&sock->remote_addr, addr, addrlen);
sock->remote_addrlen = addrlen;
if (sock->state != SOCK_BOUND) {
assert(sock->state == SOCK_NEW);
assert(!sock->was_bound);
pal_to_linux_sockaddr(&pal_local_addr, &sock->local_addr, &sock->local_addrlen);
}
return 0;
return ret;
}

static int disconnect(struct libos_handle* handle) {
Expand Down
35 changes: 35 additions & 0 deletions libos/src/sys/libos_epoll.c
Original file line number Diff line number Diff line change
Expand Up @@ -673,6 +673,41 @@ static int do_epoll_wait(int epfd, struct epoll_event* events, int maxevents, in
this_item_events |= items[i]->events & (EPOLLOUT | EPOLLWRNORM);
}

if (pal_ret_events[i] & PAL_WAIT_WRITE && items[i]->handle->type == TYPE_SOCK) {
/*
* Special case of a non-blocking socket that is INPROGRESS (connecting): must check
* if error or success of connecting. If error, then set SO_ERROR (last_error). If
* success, then move to SOCK_CONNECTED state and clear SO_ERROR.
*
* This is only relevant if EPOLLOUT event was requested.
*
* We first fetch the state atomically instead of a proper lock on the handle to
* speed up the common case of an already-connected socket doing recv/send.
*
* See similar case in libos_poll.c:do_poll().
*/
enum libos_sock_state state = __atomic_load_n(&items[i]->handle->info.sock.state,
__ATOMIC_ACQUIRE);
if (state == SOCK_CONNECTING) {
struct libos_sock_handle* sock = &items[i]->handle->info.sock;
lock(&sock->lock);
if (sock->state != SOCK_CONNECTING) {
/* theoretically, another thread could be doing another epoll on this socket
* and modify the state; we don't support this unlikely case */
BUG();
}
if (pal_ret_events[i] & (PAL_WAIT_ERROR | PAL_WAIT_HANG_UP)) {
sock->last_error = ECONNREFUSED;
} else {
sock->last_error = 0;
sock->state = SOCK_CONNECTED;
sock->can_be_read = true;
sock->can_be_written = true;
}
unlock(&sock->lock);
}
}

if (!this_item_events) {
/* This handle is not interested in events that were detected - epoll item was
* probably updated asynchronously. */
Expand Down
35 changes: 35 additions & 0 deletions libos/src/sys/libos_poll.c
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,41 @@ static long do_poll(struct pollfd* fds, size_t fds_len, uint64_t* timeout_us) {
if (ret_events[i] & PAL_WAIT_WRITE)
fds[i].revents |= fds[i].events & (POLLOUT | POLLWRNORM);

if (ret_events[i] & PAL_WAIT_WRITE && libos_handles[i]->type == TYPE_SOCK) {
/*
* Special case of a non-blocking socket that is INPROGRESS (connecting): must check if
* error or success of connecting. If error, then set SO_ERROR (last_error). If success,
* then move to SOCK_CONNECTED state and clear SO_ERROR.
*
* This is only relevant if POLLOUT event was requested.
*
* We first fetch the state atomically instead of a proper lock on the handle to speed
* up the common case of an already-connected socket doing recv/send.
*
* See similar case in libos_epoll.c:do_epoll_wait().
*/
enum libos_sock_state state = __atomic_load_n(&libos_handles[i]->info.sock.state,
__ATOMIC_ACQUIRE);
if (state == SOCK_CONNECTING) {
struct libos_sock_handle* sock = &libos_handles[i]->info.sock;
lock(&sock->lock);
if (sock->state != SOCK_CONNECTING) {
/* theoretically, another thread could be doing another select/poll on this
* socket and modify the state; we don't support this unlikely case */
BUG();
}
if (ret_events[i] & (PAL_WAIT_ERROR | PAL_WAIT_HANG_UP)) {
sock->last_error = ECONNREFUSED;
} else {
sock->last_error = 0;
sock->state = SOCK_CONNECTED;
sock->can_be_read = true;
sock->can_be_written = true;
}
unlock(&sock->lock);
}
}

if (fds[i].revents)
ret_events_count++;
}
Expand Down
45 changes: 28 additions & 17 deletions libos/src/sys/libos_socket.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,26 +15,26 @@
#include "linux_abi/errors.h"

/*
* Sockets can be in 4 states: NEW, BOUND, LISTENING and CONNECTED.
* Sockets can be in 5 states: NEW, BOUND, LISTENING, CONNECTING and CONNECTED.
*
* +------------------+
* | |
* | |
* bind() listen() V accept() old socket
* +--> NEW --------------------> BOUND -------------> LISTEN --------------+
* | | | ^ new socket
* | | | | |
* | | | | |
* | | connect() | | disconnect() |
* | | | | (if it was bound) |
* | | connect() | | |
* | | | | |
* | | V | |
* | +---------------------> CONNECTED <--------------------------------+
* | |
* | disconnect() |
* | (if it was not bound) |
* +------------------------------+
* +--> NEW +-------------------> BOUND +------------> LISTEN +-------------+
* | + + ^ new socket
* | | | | +
* | | | +------------------------+ |
* | | connect() | disconnect() | |
* | | | (if it was bound) | |
* | | connect() | | |
* | | + select()/poll()/ | |
* | | V epoll() + |
* | +---------------------> CONNECTING ---------------- CONNECTED <----+
* | +
* | disconnect() |
* | (if it was not bound) |
* +-----------------------------------------------------------+
*
*/

Expand Down Expand Up @@ -491,13 +491,20 @@ long libos_syscall_connect(int fd, void* addr, int _addrlen) {
switch (sock->state) {
case SOCK_NEW:
case SOCK_BOUND:
case SOCK_CONNECTING:
case SOCK_CONNECTED:
break;
default:
ret = -EINVAL;
goto out;
}

if (sock->state == SOCK_CONNECTING) {
assert(handle->flags & O_NONBLOCK);
ret = -EALREADY;
goto out;
}

if (sock->state == SOCK_CONNECTED) {
unsigned short addr_family;
if (addrlen < sizeof(addr_family)) {
Expand Down Expand Up @@ -542,6 +549,10 @@ long libos_syscall_connect(int fd, void* addr, int _addrlen) {
ret = sock->ops->connect(handle, addr, addrlen);
maybe_epoll_et_trigger(handle, ret, /*in=*/false, /*was_partial=*/false);
if (ret < 0) {
if (ret == -EINPROGRESS) {
sock->state = SOCK_CONNECTING;
sock->last_error = -ret;
}
goto out;
}

Expand Down Expand Up @@ -638,7 +649,7 @@ ssize_t do_sendmsg(struct libos_handle* handle, struct iovec* iov, size_t iov_le
lock(&sock->lock);
bool has_sendtimeout_set = !!sock->sendtimeout_us;

ret = -sock->last_error;
ret = -((ssize_t)sock->last_error);
sock->last_error = 0;

if (!ret && !sock->can_be_written) {
Expand Down Expand Up @@ -801,7 +812,7 @@ ssize_t do_recvmsg(struct libos_handle* handle, struct iovec* iov, size_t iov_le

lock(&sock->lock);
bool has_recvtimeout_set = !!sock->receivetimeout_us;
ret = -sock->last_error;
ret = -((ssize_t)sock->last_error);
sock->last_error = 0;
unlock(&sock->lock);

Expand Down
1 change: 1 addition & 0 deletions libos/test/regression/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ tests = {
'syscall_restart': {},
'sysfs_common': {},
'tcp_ancillary': {},
'tcp_einprogress': {},
'tcp_ipv6_v6only': {},
'tcp_msg_peek': {},
'udp': {},
Expand Down
105 changes: 105 additions & 0 deletions libos/test/regression/tcp_einprogress.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
#define _GNU_SOURCE
#include <arpa/inet.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <unistd.h>

#include "common.h"

#define ERR(msg, args...) \
errx(1, "%d: " msg, __LINE__, ##args)

#define SRV_IP "127.0.0.1"
#define PORT 12345 /* nothing must be bound to this port! */

int main(void) {
int ret;

int s = CHECK(socket(AF_INET, SOCK_STREAM, 0));

int flags = CHECK(fcntl(s, F_GETFL, 0));
CHECK(fcntl(s, F_SETFL, flags | O_NONBLOCK));

struct sockaddr_in sa = {
.sin_family = AF_INET,
.sin_port = htons(PORT),
};
if (inet_aton(SRV_IP, &sa.sin_addr) != 1)
ERR("inet_aton failed");

ret = connect(s, (void*)&sa, sizeof(sa));
if (!ret)
ERR("connect unexpectedly succeeded (expected EINPROGRESS or ECONNREFUSED)");

if (ret < 0 && errno != EINPROGRESS) {
if (errno != ECONNREFUSED)
ERR("expected connect to fail with ECONNREFUSED but failed with %s", strerror(errno));

/* boring case without EINPROGRESS (aka blocking connect) */
puts("TEST OK (no EINPROGRESS)");
CHECK(close(s));
return 0;
}

struct sockaddr_in sa_local;
socklen_t addrlen_local = sizeof(sa_local);
ret = getsockname(s, (struct sockaddr*)&sa_local, &addrlen_local);
if (ret < 0)
ERR("[after EINPROGRESS] getsockname failed with %s", strerror(errno));
printf("local address %s:%hu\n", inet_ntoa(sa_local.sin_addr), ntohs(sa_local.sin_port));
fflush(stdout);

ret = connect(s, (void*)&sa, sizeof(sa));
if (ret != -1 || errno != EALREADY) {
if (errno == ECONNREFUSED) {
/* boring case with EINPROGRESS but a quick response */
puts("TEST OK (quick response)");
CHECK(close(s));
return 0;
}
ERR("[after EINPROGRESS] expected second connect to fail with EALREADY but failed with %s",
strerror(errno));
}

struct sockaddr_in sa_peer;
socklen_t addrlen_peer = sizeof(sa_peer);
ret = getpeername(s, (struct sockaddr*)&sa_peer, &addrlen_peer);
if (ret != -1 || errno != ENOTCONN)
ERR("[after EINPROGRESS] expected getpeername to fail with ENOTCONN but failed with %s",
strerror(errno));

struct pollfd infds[] = {
{.fd = s, .events = POLLOUT},
};
ret = CHECK(poll(infds, 1, /*timout_ms=*/10000));
if (ret == 0) {
/* one interesting case -- remote peer is completely unresponsive */
puts("TEST OK (connection timed out)");
CHECK(close(s));
return 0;
}

/* the most interesting case -- remote peer not unresponsive but very slow */
if (!(infds[0].revents & POLLOUT)) {
ERR("polling didn't return POLLOUT on connecting socket");
}

int so_error;
socklen_t optlen = sizeof(so_error);
CHECK(getsockopt(s, SOL_SOCKET, SO_ERROR, &so_error, &optlen));
if (optlen != sizeof(so_error) || so_error != ECONNREFUSED) {
ERR("[after EINPROGRESS] expected SO_ERROR to be ECONNREFUSED but it is %s",
strerror(so_error));
}

puts("TEST OK (connection refused after initial EINPROGRESS)");
CHECK(close(s));
return 0;
}
7 changes: 7 additions & 0 deletions libos/test/regression/test_libos.py
Original file line number Diff line number Diff line change
Expand Up @@ -1411,6 +1411,13 @@ def test_301_socket_tcp_ancillary(self):
stdout, _ = self.run_binary(['tcp_ancillary'])
self.assertIn('TEST OK', stdout)

def test_302_socket_tcp_einprogress(self):
# To really test EINPROGRESS, must emulate an unresponsive peer. This is not possible to
# achieve in a simple automated manner. It is recommended to run this test manually, with
# peer emulation as described in https://github.com/gramineproject/gramine/issues/1641.
stdout, _ = self.run_binary(['tcp_einprogress'])
self.assertIn('TEST OK', stdout)

def test_310_socket_tcp_ipv6_v6only(self):
stdout, _ = self.run_binary(['tcp_ipv6_v6only'], timeout=50)
self.assertIn('test completed successfully', stdout)
Expand Down
1 change: 1 addition & 0 deletions libos/test/regression/tests.toml
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ manifests = [
"syscall_restart",
"sysfs_common",
"tcp_ancillary",
"tcp_einprogress",
"tcp_ipv6_v6only",
"tcp_msg_peek",
"toml_parsing",
Expand Down
1 change: 1 addition & 0 deletions libos/test/regression/tests_musl.toml
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ manifests = [
"syscall_restart",
"sysfs_common",
"tcp_ancillary",
"tcp_einprogress",
"tcp_ipv6_v6only",
"tcp_msg_peek",
"toml_parsing",
Expand Down
4 changes: 2 additions & 2 deletions pal/include/pal/pal.h
Original file line number Diff line number Diff line change
Expand Up @@ -593,8 +593,8 @@ int PalSocketAccept(PAL_HANDLE handle, pal_stream_options_t options, PAL_HANDLE*
*
* \param handle Handle to the socket.
* \param addr Address to connect to.
* \param[out] out_local_addr On success contains the local address of the socket.
* Can be NULL, to ignore the result.
* \param[out] out_local_addr On success (or in special-case of PAL_ERROR_INPROGRESS) contains the
* local address of the socket. Can be NULL, to ignore the result.
*
* \returns 0 on success, negative error code on failure.
*
Expand Down
Loading

0 comments on commit 144916b

Please sign in to comment.