Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

upstream: Null-deref on TCP health checker if setsockopt fails #6793

Merged
merged 13 commits into from
May 10, 2019
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions source/common/upstream/health_checker_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,7 @@ TcpHealthCheckerImpl::TcpActiveHealthCheckSession::~TcpActiveHealthCheckSession(

// If a client connection is still held, marks the upcoming close as expected (so it is not
// treated as a health check failure) and closes the connection without flushing.
void TcpHealthCheckerImpl::TcpActiveHealthCheckSession::onDeferredDelete() {
  // Nothing to tear down when no connection is currently held.
  if (!client_) {
    return;
  }

  expect_reset_ = true;
  client_->close(Network::ConnectionCloseType::NoFlush);
}
Expand All @@ -371,6 +372,7 @@ void TcpHealthCheckerImpl::TcpActiveHealthCheckSession::onData(Buffer::Instance&
data.drain(data.length());
handleSuccess(false);
if (!parent_.reuse_connection_) {
expect_reset_ = true;
client_->close(Network::ConnectionCloseType::NoFlush);
}
} else {
Expand All @@ -379,12 +381,11 @@ void TcpHealthCheckerImpl::TcpActiveHealthCheckSession::onData(Buffer::Instance&
}

void TcpHealthCheckerImpl::TcpActiveHealthCheckSession::onEvent(Network::ConnectionEvent event) {
if (event == Network::ConnectionEvent::RemoteClose) {
handleFailure(envoy::data::core::v2alpha::HealthCheckFailureType::NETWORK);
}

if (event == Network::ConnectionEvent::RemoteClose ||
event == Network::ConnectionEvent::LocalClose) {
if (!expect_reset_) {
handleFailure(envoy::data::core::v2alpha::HealthCheckFailureType::NETWORK);
}
parent_.dispatcher_.deferredDelete(std::move(client_));
}

Expand All @@ -403,6 +404,7 @@ void TcpHealthCheckerImpl::TcpActiveHealthCheckSession::onEvent(Network::Connect
// TODO(mattklein123): In the case that a user configured bytes to write, they will not be
// written, since we currently have no way to know if the bytes actually get written via
// the connection interface. We might want to figure out how to handle this better later.
expect_reset_ = true;
client_->close(Network::ConnectionCloseType::NoFlush);
handleSuccess(false);
}
Expand All @@ -416,6 +418,7 @@ void TcpHealthCheckerImpl::TcpActiveHealthCheckSession::onInterval() {
client_->addConnectionCallbacks(*session_callbacks_);
client_->addReadFilter(session_callbacks_);

expect_reset_ = false;
client_->connect();
client_->noDelay(true);
}
Expand All @@ -431,6 +434,7 @@ void TcpHealthCheckerImpl::TcpActiveHealthCheckSession::onInterval() {
}

// Timeout handler for the TCP session: flags the upcoming close as self-initiated, records
// TIMEOUT as the host's active health failure type, and closes the client connection without
// flushing.
void TcpHealthCheckerImpl::TcpActiveHealthCheckSession::onTimeout() {
// Set before close(): onEvent() only calls handleFailure() when this flag is false, so the
// close below is presumably not reported a second time as a network failure.
expect_reset_ = true;
mattklein123 marked this conversation as resolved.
Show resolved Hide resolved
host_->setActiveHealthFailureType(Host::ActiveHealthFailureType::TIMEOUT);
client_->close(Network::ConnectionCloseType::NoFlush);
}
Expand Down
3 changes: 3 additions & 0 deletions source/common/upstream/health_checker_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,9 @@ class TcpHealthCheckerImpl : public HealthCheckerImplBase {
TcpHealthCheckerImpl& parent_;
Network::ClientConnectionPtr client_;
std::shared_ptr<TcpSessionCallbacks> session_callbacks_;
// If true, the stream reset was initiated by us rather than by e.g. a remote reset.
// In this case the health check status has already been reported and only state cleanup is
// required.
bool expect_reset_{};
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: can you make this expect_close_ and update the comment above to talk about close vs. reset? There is no reset happening for TCP.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

};

// Owning pointer to a TCP active health check session.
using TcpActiveHealthCheckSessionPtr = std::unique_ptr<TcpActiveHealthCheckSession>;
Expand Down
89 changes: 88 additions & 1 deletion test/common/upstream/health_checker_impl_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2336,7 +2336,7 @@ TEST_F(TcpHealthCheckerImplTest, WrongData) {
Host::ActiveHealthFailureType::UNHEALTHY);
}

TEST_F(TcpHealthCheckerImplTest, Timeout) {
TEST_F(TcpHealthCheckerImplTest, TimeoutThenRemoteClose) {
InSequence s;

setupData();
Expand Down Expand Up @@ -2396,6 +2396,66 @@ TEST_F(TcpHealthCheckerImplTest, Timeout) {
cluster_->prioritySet().getMockHostSet(0)->runCallbacks({}, removed);
}

// Verifies that two consecutive health check timeouts move the host from Healthy to Unhealthy:
// after the first timeout the host is still Healthy, after the second it is Unhealthy. Both
// timeouts record TIMEOUT as the active health failure type.
TEST_F(TcpHealthCheckerImplTest, DoubleTimeout) {
// Every EXPECT_CALL below must be satisfied in declaration order.
InSequence s;

setupData();
health_checker_->start();

// Add a single host; this is expected to create a session and a client connection, write on
// the connection, and arm the timeout timer.
expectSessionCreate();
expectClientCreate();
cluster_->prioritySet().getMockHostSet(0)->hosts_ = {
makeTestHost(cluster_->info_, "tcp://127.0.0.1:80")};
EXPECT_CALL(*connection_, write(_, _));
EXPECT_CALL(*timeout_timer_, enableTimer(_));

cluster_->prioritySet().getMockHostSet(0)->runCallbacks(
{cluster_->prioritySet().getMockHostSet(0)->hosts_.back()}, {});

connection_->raiseEvent(Network::ConnectionEvent::Connected);

// Deliver a single byte of response data before the timeout fires.
// NOTE(review): presumably this is only a partial match of the expected response, so the check
// stays pending -- confirm against setupData().
Buffer::OwnedImpl response;
add_uint8(response, 1);
read_filter_->onData(response, false);

// First timeout: the connection is closed, an unhealthy event is logged, the timeout timer is
// disabled, and the interval timer is re-armed.
EXPECT_CALL(*connection_, close(_));
EXPECT_CALL(*event_logger_, logUnhealthy(_, _, _, true));
EXPECT_CALL(*timeout_timer_, disableTimer());
EXPECT_CALL(*interval_timer_, enableTimer(_));
mattklein123 marked this conversation as resolved.
Show resolved Hide resolved
timeout_timer_->callback_();
// A single timeout records the failure type but leaves the host Healthy.
EXPECT_EQ(cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->getActiveHealthFailureType(),
Host::ActiveHealthFailureType::TIMEOUT);
EXPECT_EQ(Host::Health::Healthy, cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->health());

// Next interval: a new client connection is created and the check is attempted again.
expectClientCreate();
EXPECT_CALL(*connection_, write(_, _));
EXPECT_CALL(*timeout_timer_, enableTimer(_));
interval_timer_->callback_();

connection_->raiseEvent(Network::ConnectionEvent::Connected);

// Second timeout in a row: this time an eject-unhealthy event is logged.
EXPECT_CALL(*connection_, close(_));
EXPECT_CALL(*event_logger_, logEjectUnhealthy(_, _, _));
EXPECT_CALL(*timeout_timer_, disableTimer());
EXPECT_CALL(*interval_timer_, enableTimer(_));
timeout_timer_->callback_();
// After two consecutive timeouts the host is reported Unhealthy.
EXPECT_EQ(cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->getActiveHealthFailureType(),
Host::ActiveHealthFailureType::TIMEOUT);
EXPECT_EQ(Host::Health::Unhealthy, cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->health());
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here's where I check that two timeouts in a row cause us to mark this host unhealthy


// Start a third check, then remove the host while that check is in flight.
expectClientCreate();
EXPECT_CALL(*connection_, write(_, _));
EXPECT_CALL(*timeout_timer_, enableTimer(_));
interval_timer_->callback_();

connection_->raiseEvent(Network::ConnectionEvent::Connected);

// Removing the host is expected to close its connection.
HostVector removed{cluster_->prioritySet().getMockHostSet(0)->hosts_.back()};
cluster_->prioritySet().getMockHostSet(0)->hosts_.clear();
EXPECT_CALL(*connection_, close(_));
cluster_->prioritySet().getMockHostSet(0)->runCallbacks({}, removed);
}

// Tests that when reuse_connection is false timeouts execute normally.
TEST_F(TcpHealthCheckerImplTest, TimeoutWithoutReusingConnection) {
InSequence s;
Expand Down Expand Up @@ -2585,6 +2645,33 @@ TEST_F(TcpHealthCheckerImplTest, PassiveFailureCrossThreadRemoveClusterRace) {
EXPECT_EQ(0UL, cluster_->info_->stats_store_.counter("health_check.passive_failure").value());
}

// Verifies that a LocalClose which the health checker did not request itself (for example one
// caused by a setsockopt() failure) is accounted for as a failed health check attempt.
TEST_F(TcpHealthCheckerImplTest, ConnectionLocalFailure) {
  InSequence s;

  setupData();

  // Register the single host before the checker starts.
  auto* host_set = cluster_->prioritySet().getMockHostSet(0);
  host_set->hosts_ = {makeTestHost(cluster_->info_, "tcp://127.0.0.1:80")};

  // Starting the checker should create the session and client, write on the connection, and
  // arm the timeout timer.
  expectSessionCreate();
  expectClientCreate();
  EXPECT_CALL(*connection_, write(_, _));
  EXPECT_CALL(*timeout_timer_, enableTimer(_));
  health_checker_->start();

  // The unsolicited LocalClose must be reported as a health check failure.
  EXPECT_CALL(*event_logger_, logUnhealthy(_, _, _, true));
  EXPECT_CALL(*timeout_timer_, disableTimer());
  EXPECT_CALL(*interval_timer_, enableTimer(_));

  // Simulate a close that originates locally but outside the health monitor,
  // e.g. a failure to setsockopt().
  connection_->raiseEvent(Network::ConnectionEvent::LocalClose);

  // One attempt, no success, one active failure, and no passive failure.
  auto& stats = cluster_->info_->stats_store_;
  EXPECT_EQ(1UL, stats.counter("health_check.attempt").value());
  EXPECT_EQ(0UL, stats.counter("health_check.success").value());
  EXPECT_EQ(1UL, stats.counter("health_check.failure").value());
  EXPECT_EQ(0UL, stats.counter("health_check.passive_failure").value());
}

class TestGrpcHealthCheckerImpl : public GrpcHealthCheckerImpl {
public:
using GrpcHealthCheckerImpl::GrpcHealthCheckerImpl;
Expand Down