diff --git a/include/envoy/upstream/upstream.h b/include/envoy/upstream/upstream.h index 53ad7df1b5e0..4f52ef8dad3c 100644 --- a/include/envoy/upstream/upstream.h +++ b/include/envoy/upstream/upstream.h @@ -22,6 +22,11 @@ class Host : virtual public HostDescription { HostDescriptionPtr host_description_; }; + struct HealthFailures { + // The host is currently failing active health checks. + static const uint64_t ACTIVE_HC = 0x1; + }; + /** * @return host specific counters. */ @@ -44,14 +49,21 @@ class Host : virtual public HostDescription { virtual std::list> gauges() const PURE; /** - * @return bool whether the host is currently healthy and routable. + * @return all health failure states for the host. This is a logical OR of HealthFailures. + */ + virtual uint64_t healthFailures() const PURE; + + /** + * Atomically clear a health failure state for a host. Failure states are specified in + * HealthFailures. */ - virtual bool healthy() const PURE; + virtual void healthFailureClear(uint64_t failure) PURE; /** - * Set whether the host is currently healthy and routable. + * Atomically set a health failure state for a host. Failure states are specified in + * HealthFailures. */ - virtual void healthy(bool is_healthy) PURE; + virtual void healthFailureSet(uint64_t failure) PURE; /** * @return the current load balancing weight of the host, in the range 1-100. diff --git a/source/common/upstream/health_checker_impl.cc b/source/common/upstream/health_checker_impl.cc index d80fca2bb95d..f1cecb293489 100644 --- a/source/common/upstream/health_checker_impl.cc +++ b/source/common/upstream/health_checker_impl.cc @@ -101,13 +101,13 @@ HealthCheckerImplBase::ActiveHealthCheckSession::ActiveHealthCheckSession( interval_timer_(parent.dispatcher_.createTimer([this]() -> void { onInterval(); })), timeout_timer_(parent.dispatcher_.createTimer([this]() -> void { onTimeout(); })) { - if (host->healthy()) { + if (!(host->healthFailures() & Host::HealthFailures::ACTIVE_HC)) { parent.incHealthy(); } } HealthCheckerImplBase::ActiveHealthCheckSession::~ActiveHealthCheckSession() { - if (host_->healthy()) { + if (!(host_->healthFailures() & Host::HealthFailures::ACTIVE_HC)) { parent_.decHealthy(); } } @@ -117,12 +117,12 @@ void HealthCheckerImplBase::ActiveHealthCheckSession::handleSuccess() { num_unhealthy_ = 0; bool changed_state = false; - if (!host_->healthy()) { + if (host_->healthFailures() & Host::HealthFailures::ACTIVE_HC) { // If this is the first time we ever got a check result on this host, we immediately move // it to healthy. This makes startup faster with a small reduction in overall reliability // depending on the HC settings. if (first_check_ || ++num_healthy_ == parent_.healthy_threshold_) { - host_->healthy(true); + host_->healthFailureClear(Host::HealthFailures::ACTIVE_HC); parent_.incHealthy(); changed_state = true; } @@ -138,9 +138,9 @@ void HealthCheckerImplBase::ActiveHealthCheckSession::handleFailure(bool timeout num_healthy_ = 0; bool changed_state = false; - if (host_->healthy()) { + if (!(host_->healthFailures() & Host::HealthFailures::ACTIVE_HC)) { if (!timeout || ++num_unhealthy_ == parent_.unhealthy_threshold_) { - host_->healthy(false); + host_->healthFailureSet(Host::HealthFailures::ACTIVE_HC); parent_.decHealthy(); changed_state = true; } @@ -249,7 +249,7 @@ void HttpHealthCheckerImpl::HttpActiveHealthCheckSession::onResetStream(Http::St } timeout_timer_->disableTimer(); - conn_log_debug("connection/stream error host_healthy={}", *client_, host_->healthy()); + conn_log_debug("connection/stream error health_failures={}", *client_, host_->healthFailures()); handleFailure(true); interval_timer_->enableTimer(parent_.interval()); } @@ -261,7 +261,8 @@ bool HttpHealthCheckerImpl::HttpActiveHealthCheckSession::isHealthCheckSucceeded // the host is healthy, we need to see if we have reached the unhealthy count. If a host returns // a response code other than 200 we ignore the number of unhealthy and immediately set it to // unhealthy. - conn_log_debug("hc response={} host_healthy={}", *client_, response_code, host_->healthy()); + conn_log_debug("hc response={} health_failures={}", *client_, response_code, + host_->healthFailures()); if (response_code != enumToInt(Http::Code::OK)) { return false; @@ -298,7 +299,7 @@ void HttpHealthCheckerImpl::HttpActiveHealthCheckSession::onResponseComplete() { } void HttpHealthCheckerImpl::HttpActiveHealthCheckSession::onTimeout() { - conn_log_debug("connection/stream timeout host_healthy={}", *client_, host_->healthy()); + conn_log_debug("connection/stream timeout health_failures={}", *client_, host_->healthFailures()); handleFailure(true); // If there is an active request it will get reset, so make sure we ignore the reset. diff --git a/source/common/upstream/upstream_impl.cc b/source/common/upstream/upstream_impl.cc index 549376349d49..c3c967d13864 100644 --- a/source/common/upstream/upstream_impl.cc +++ b/source/common/upstream/upstream_impl.cc @@ -83,7 +83,7 @@ ClusterImplBase::ClusterImplBase(const Json::Object& config, Stats::Store& stats ConstHostVectorPtr ClusterImplBase::createHealthyHostList(const std::vector& hosts) { HostVectorPtr healthy_list(new std::vector()); for (auto host : hosts) { - if (host->healthy()) { + if (!host->healthFailures()) { healthy_list->emplace_back(host); } } @@ -213,14 +213,16 @@ bool BaseDynamicClusterImpl::updateDynamicHostList(const std::vector& n hosts_added.push_back(host); // If we are depending on a health checker, we initialize to unhealthy. - hosts_added.back()->healthy(!depend_on_hc); + if (depend_on_hc) { + hosts_added.back()->healthFailureSet(Host::HealthFailures::ACTIVE_HC); + } } } // If there are removed hosts, check to see if we should only delete if unhealthy. if (!current_hosts.empty() && depend_on_hc) { for (auto i = current_hosts.begin(); i != current_hosts.end();) { - if ((*i)->healthy()) { + if (!((*i)->healthFailures() & Host::HealthFailures::ACTIVE_HC)) { if ((*i)->weight() > max_host_weight) { max_host_weight = (*i)->weight(); } diff --git a/source/common/upstream/upstream_impl.h b/source/common/upstream/upstream_impl.h index d1d219d17315..c099b42650e7 100644 --- a/source/common/upstream/upstream_impl.h +++ b/source/common/upstream/upstream_impl.h @@ -67,8 +67,9 @@ class HostImpl : public HostDescriptionImpl, std::list> gauges() const override { return stats_store_.gauges(); } - bool healthy() const override { return healthy_; } - void healthy(bool is_healthy) override { healthy_ = is_healthy; } + uint64_t healthFailures() const override { return health_failures_; } + void healthFailureClear(uint64_t failure) override { health_failures_ &= ~failure; } + void healthFailureSet(uint64_t failure) override { health_failures_ |= failure; } uint32_t weight() const override { return weight_; } void weight(uint32_t new_weight); @@ -77,7 +78,7 @@ class HostImpl : public HostDescriptionImpl, createConnection(Event::Dispatcher& dispatcher, const Cluster& cluster, const std::string& url); private: - std::atomic healthy_{true}; + std::atomic health_failures_{}; std::atomic weight_; }; diff --git a/source/server/http/admin.cc b/source/server/http/admin.cc index 70924581885c..0c09c2cdbd33 100644 --- a/source/server/http/admin.cc +++ b/source/server/http/admin.cc @@ -117,8 +117,8 @@ Http::Code AdminImpl::handlerClusters(const std::string&, Buffer::Instance& resp stat.first, stat.second)); } - response.add(fmt::format("{}::{}::healthy::{}\n", cluster.second->name(), host->url(), - host->healthy())); + response.add(fmt::format("{}::{}::health_failures::{}\n", cluster.second->name(), host->url(), + host->healthFailures())); response.add( fmt::format("{}::{}::weight::{}\n", cluster.second->name(), host->url(), host->weight())); response.add( diff --git a/test/common/upstream/health_checker_impl_test.cc b/test/common/upstream/health_checker_impl_test.cc index 4e20bb073a8b..280c5c2ddac5 100644 --- a/test/common/upstream/health_checker_impl_test.cc +++ b/test/common/upstream/health_checker_impl_test.cc @@ -17,6 +17,9 @@ using testing::SaveArg; namespace Upstream { +// Satisfy linker +const uint64_t Host::HealthFailures::ACTIVE_HC; + class TestHttpHealthCheckerImpl : public HttpHealthCheckerImpl { public: using HttpHealthCheckerImpl::HttpHealthCheckerImpl; @@ -171,7 +174,7 @@ TEST_F(HttpHealthCheckerImplTest, Success) { .WillOnce(Return(45000)); EXPECT_CALL(*test_sessions_[0]->interval_timer_, enableTimer(std::chrono::milliseconds(45000))); respond(0, "200", false, true); - EXPECT_TRUE(cluster_->hosts_[0]->healthy()); + EXPECT_EQ(0UL, cluster_->hosts_[0]->healthFailures()); } TEST_F(HttpHealthCheckerImplTest, SuccessServiceCheck) { @@ -193,7 +196,7 @@ TEST_F(HttpHealthCheckerImplTest, SuccessServiceCheck) { EXPECT_CALL(*test_sessions_[0]->interval_timer_, enableTimer(std::chrono::milliseconds(45000))); Optional health_checked_cluster("locations-production-iad"); respond(0, "200", false, true, false, health_checked_cluster); - EXPECT_TRUE(cluster_->hosts_[0]->healthy()); + EXPECT_EQ(0UL, cluster_->hosts_[0]->healthFailures()); } TEST_F(HttpHealthCheckerImplTest, ServiceDoesNotMatchFail) { @@ -215,7 +218,7 @@ TEST_F(HttpHealthCheckerImplTest, ServiceDoesNotMatchFail) { EXPECT_CALL(*test_sessions_[0]->interval_timer_, enableTimer(std::chrono::milliseconds(45000))); Optional health_checked_cluster("api-production-iad"); respond(0, "200", false, true, false, health_checked_cluster); - EXPECT_FALSE(cluster_->hosts_[0]->healthy()); + EXPECT_EQ(Host::HealthFailures::ACTIVE_HC, cluster_->hosts_[0]->healthFailures()); } TEST_F(HttpHealthCheckerImplTest, ServiceNotPresentInResponseFail) { @@ -236,7 +239,7 @@ TEST_F(HttpHealthCheckerImplTest, ServiceNotPresentInResponseFail) { .WillOnce(Return(45000)); EXPECT_CALL(*test_sessions_[0]->interval_timer_, enableTimer(std::chrono::milliseconds(45000))); respond(0, "200", false, true, false); - EXPECT_FALSE(cluster_->hosts_[0]->healthy()); + EXPECT_EQ(Host::HealthFailures::ACTIVE_HC, cluster_->hosts_[0]->healthFailures()); } TEST_F(HttpHealthCheckerImplTest, ServiceCheckRuntimeOff) { @@ -258,7 +261,7 @@ TEST_F(HttpHealthCheckerImplTest, ServiceCheckRuntimeOff) { EXPECT_CALL(*test_sessions_[0]->interval_timer_, enableTimer(std::chrono::milliseconds(45000))); Optional health_checked_cluster("api-production-iad"); respond(0, "200", false, true, false, health_checked_cluster); - EXPECT_TRUE(cluster_->hosts_[0]->healthy()); + EXPECT_EQ(0UL, cluster_->hosts_[0]->healthFailures()); } TEST_F(HttpHealthCheckerImplTest, SuccessStartFailedFailFirstServiceCheck) { @@ -266,7 +269,7 @@ TEST_F(HttpHealthCheckerImplTest, SuccessStartFailedFailFirstServiceCheck) { EXPECT_CALL(runtime_.snapshot_, featureEnabled("health_check.verify_cluster", 100)) .WillRepeatedly(Return(true)); cluster_->hosts_ = {HostPtr{new HostImpl(*cluster_, "tcp://127.0.0.1:80", false, 1, "")}}; - cluster_->hosts_[0]->healthy(false); + cluster_->hosts_[0]->healthFailureSet(Host::HealthFailures::ACTIVE_HC); expectSessionCreate(); expectStreamCreate(0); health_checker_->start(); @@ -275,19 +278,19 @@ TEST_F(HttpHealthCheckerImplTest, SuccessStartFailedFailFirstServiceCheck) { // Test that failing first disables fast success. EXPECT_CALL(*this, onHostStatus(_, false)); respond(0, "503", false, false, false, health_checked_cluster); - EXPECT_FALSE(cluster_->hosts_[0]->healthy()); + EXPECT_EQ(Host::HealthFailures::ACTIVE_HC, cluster_->hosts_[0]->healthFailures()); expectStreamCreate(0); test_sessions_[0]->interval_timer_->callback_(); EXPECT_CALL(*this, onHostStatus(_, false)); respond(0, "200", false, false, false, health_checked_cluster); - EXPECT_FALSE(cluster_->hosts_[0]->healthy()); + EXPECT_EQ(Host::HealthFailures::ACTIVE_HC, cluster_->hosts_[0]->healthFailures()); expectStreamCreate(0); test_sessions_[0]->interval_timer_->callback_(); EXPECT_CALL(*this, onHostStatus(_, true)); respond(0, "200", false, false, false, health_checked_cluster); - EXPECT_TRUE(cluster_->hosts_[0]->healthy()); + EXPECT_EQ(0UL, cluster_->hosts_[0]->healthFailures()); } TEST_F(HttpHealthCheckerImplTest, SuccessNoTraffic) { @@ -301,13 +304,13 @@ TEST_F(HttpHealthCheckerImplTest, SuccessNoTraffic) { EXPECT_CALL(*test_sessions_[0]->interval_timer_, enableTimer(std::chrono::milliseconds(60000))); respond(0, "200", false, true, true); - EXPECT_TRUE(cluster_->hosts_[0]->healthy()); + EXPECT_EQ(0UL, cluster_->hosts_[0]->healthFailures()); } TEST_F(HttpHealthCheckerImplTest, SuccessStartFailedSuccessFirst) { setupNoServiceValidationHC(); cluster_->hosts_ = {HostPtr{new HostImpl(*cluster_, "tcp://127.0.0.1:80", false, 1, "")}}; - cluster_->hosts_[0]->healthy(false); + cluster_->hosts_[0]->healthFailureSet(Host::HealthFailures::ACTIVE_HC); expectSessionCreate(); expectStreamCreate(0); health_checker_->start(); @@ -318,13 +321,13 @@ TEST_F(HttpHealthCheckerImplTest, SuccessStartFailedSuccessFirst) { EXPECT_CALL(runtime_.snapshot_, getInteger("health_check.min_interval", _)); EXPECT_CALL(*test_sessions_[0]->interval_timer_, enableTimer(std::chrono::milliseconds(500))); respond(0, "200", false); - EXPECT_TRUE(cluster_->hosts_[0]->healthy()); + EXPECT_EQ(0UL, cluster_->hosts_[0]->healthFailures()); } TEST_F(HttpHealthCheckerImplTest, SuccessStartFailedFailFirst) { setupNoServiceValidationHC(); cluster_->hosts_ = {HostPtr{new HostImpl(*cluster_, "tcp://127.0.0.1:80", false, 1, "")}}; - cluster_->hosts_[0]->healthy(false); + cluster_->hosts_[0]->healthFailureSet(Host::HealthFailures::ACTIVE_HC); expectSessionCreate(); expectStreamCreate(0); health_checker_->start(); @@ -332,19 +335,19 @@ TEST_F(HttpHealthCheckerImplTest, SuccessStartFailedFailFirst) { // Test that failing first disables fast success. EXPECT_CALL(*this, onHostStatus(_, false)); respond(0, "503", false); - EXPECT_FALSE(cluster_->hosts_[0]->healthy()); + EXPECT_EQ(Host::HealthFailures::ACTIVE_HC, cluster_->hosts_[0]->healthFailures()); expectStreamCreate(0); test_sessions_[0]->interval_timer_->callback_(); EXPECT_CALL(*this, onHostStatus(_, false)); respond(0, "200", false); - EXPECT_FALSE(cluster_->hosts_[0]->healthy()); + EXPECT_EQ(Host::HealthFailures::ACTIVE_HC, cluster_->hosts_[0]->healthFailures()); expectStreamCreate(0); test_sessions_[0]->interval_timer_->callback_(); EXPECT_CALL(*this, onHostStatus(_, true)); respond(0, "200", false); - EXPECT_TRUE(cluster_->hosts_[0]->healthy()); + EXPECT_EQ(0UL, cluster_->hosts_[0]->healthFailures()); } TEST_F(HttpHealthCheckerImplTest, HttpFail) { @@ -356,19 +359,19 @@ TEST_F(HttpHealthCheckerImplTest, HttpFail) { EXPECT_CALL(*this, onHostStatus(_, true)); respond(0, "503", false); - EXPECT_FALSE(cluster_->hosts_[0]->healthy()); + EXPECT_EQ(Host::HealthFailures::ACTIVE_HC, cluster_->hosts_[0]->healthFailures()); expectStreamCreate(0); test_sessions_[0]->interval_timer_->callback_(); EXPECT_CALL(*this, onHostStatus(_, false)); respond(0, "200", false); - EXPECT_FALSE(cluster_->hosts_[0]->healthy()); + EXPECT_EQ(Host::HealthFailures::ACTIVE_HC, cluster_->hosts_[0]->healthFailures()); expectStreamCreate(0); test_sessions_[0]->interval_timer_->callback_(); EXPECT_CALL(*this, onHostStatus(_, true)); respond(0, "200", false); - EXPECT_TRUE(cluster_->hosts_[0]->healthy()); + EXPECT_EQ(0UL, cluster_->hosts_[0]->healthFailures()); } TEST_F(HttpHealthCheckerImplTest, Disconnect) { @@ -381,7 +384,7 @@ TEST_F(HttpHealthCheckerImplTest, Disconnect) { health_checker_->start(); test_sessions_[0]->client_connection_->raiseEvents(Network::ConnectionEvent::RemoteClose); - EXPECT_TRUE(cluster_->hosts_[0]->healthy()); + EXPECT_EQ(0UL, cluster_->hosts_[0]->healthFailures()); expectClientCreate(0); expectStreamCreate(0); @@ -389,7 +392,7 @@ TEST_F(HttpHealthCheckerImplTest, Disconnect) { EXPECT_CALL(*this, onHostStatus(cluster_->hosts_[0], true)); test_sessions_[0]->client_connection_->raiseEvents(Network::ConnectionEvent::RemoteClose); - EXPECT_FALSE(cluster_->hosts_[0]->healthy()); + EXPECT_EQ(Host::HealthFailures::ACTIVE_HC, cluster_->hosts_[0]->healthFailures()); } TEST_F(HttpHealthCheckerImplTest, Timeout) { @@ -402,7 +405,7 @@ TEST_F(HttpHealthCheckerImplTest, Timeout) { EXPECT_CALL(*this, onHostStatus(_, false)); EXPECT_CALL(*test_sessions_[0]->client_connection_, close(_)); test_sessions_[0]->timeout_timer_->callback_(); - EXPECT_TRUE(cluster_->hosts_[0]->healthy()); + EXPECT_EQ(0UL, cluster_->hosts_[0]->healthFailures()); expectClientCreate(0); expectStreamCreate(0); @@ -410,7 +413,7 @@ TEST_F(HttpHealthCheckerImplTest, Timeout) { EXPECT_CALL(*this, onHostStatus(_, true)); test_sessions_[0]->client_connection_->raiseEvents(Network::ConnectionEvent::RemoteClose); - EXPECT_FALSE(cluster_->hosts_[0]->healthy()); + EXPECT_EQ(Host::HealthFailures::ACTIVE_HC, cluster_->hosts_[0]->healthFailures()); } TEST_F(HttpHealthCheckerImplTest, DynamicAddAndRemove) { @@ -439,7 +442,7 @@ TEST_F(HttpHealthCheckerImplTest, ConnectionClose) { EXPECT_CALL(*test_sessions_[0]->interval_timer_, enableTimer(_)); respond(0, "200", true); - EXPECT_TRUE(cluster_->hosts_[0]->healthy()); + EXPECT_EQ(0UL, cluster_->hosts_[0]->healthFailures()); expectClientCreate(0); expectStreamCreate(0); @@ -457,7 +460,7 @@ TEST_F(HttpHealthCheckerImplTest, RemoteCloseBetweenChecks) { EXPECT_CALL(*test_sessions_[0]->interval_timer_, enableTimer(_)); respond(0, "200", false); - EXPECT_TRUE(cluster_->hosts_[0]->healthy()); + EXPECT_EQ(0UL, cluster_->hosts_[0]->healthFailures()); test_sessions_[0]->client_connection_->raiseEvents(Network::ConnectionEvent::RemoteClose); @@ -466,7 +469,7 @@ TEST_F(HttpHealthCheckerImplTest, RemoteCloseBetweenChecks) { test_sessions_[0]->interval_timer_->callback_(); EXPECT_CALL(*test_sessions_[0]->interval_timer_, enableTimer(_)); respond(0, "200", false); - EXPECT_TRUE(cluster_->hosts_[0]->healthy()); + EXPECT_EQ(0UL, cluster_->hosts_[0]->healthFailures()); } TEST(TcpHealthCheckMatcher, loadJsonBytes) { @@ -630,7 +633,7 @@ TEST_F(TcpHealthCheckerImplTest, Timeout) { EXPECT_CALL(*interval_timer_, enableTimer(_)); EXPECT_CALL(*connection_, close(_)); timeout_timer_->callback_(); - EXPECT_TRUE(cluster_->hosts_[0]->healthy()); + EXPECT_EQ(0UL, cluster_->hosts_[0]->healthFailures()); expectClientCreate(); interval_timer_->callback_(); @@ -638,7 +641,7 @@ TEST_F(TcpHealthCheckerImplTest, Timeout) { EXPECT_CALL(*timeout_timer_, disableTimer()); EXPECT_CALL(*interval_timer_, enableTimer(_)); connection_->raiseEvents(Network::ConnectionEvent::RemoteClose); - EXPECT_FALSE(cluster_->hosts_[0]->healthy()); + EXPECT_EQ(Host::HealthFailures::ACTIVE_HC, cluster_->hosts_[0]->healthFailures()); expectClientCreate(); interval_timer_->callback_(); diff --git a/test/common/upstream/sds_test.cc b/test/common/upstream/sds_test.cc index 8fe831ca096f..016f93bc056c 100644 --- a/test/common/upstream/sds_test.cc +++ b/test/common/upstream/sds_test.cc @@ -52,7 +52,7 @@ class SdsTest : public testing::Test { uint64_t numHealthy() { uint64_t healthy = 0; for (HostPtr host : cluster_->hosts()) { - if (host->healthy()) { + if (!host->healthFailures()) { healthy++; } } @@ -202,7 +202,7 @@ TEST_F(SdsTest, HealthChecker) { // Now run through and make all the hosts healthy except for the first one. for (size_t i = 1; i < cluster_->hosts().size(); i++) { - cluster_->hosts()[i]->healthy(true); + cluster_->hosts()[i]->healthFailureClear(Host::HealthFailures::ACTIVE_HC); health_checker->runCallbacks(cluster_->hosts()[i], true); } @@ -212,7 +212,7 @@ TEST_F(SdsTest, HealthChecker) { // Do the last one now which should fire the initialized event. EXPECT_CALL(membership_updated_, ready()); - cluster_->hosts()[0]->healthy(true); + cluster_->hosts()[0]->healthFailureClear(Host::HealthFailures::ACTIVE_HC); health_checker->runCallbacks(cluster_->hosts()[0], true); EXPECT_EQ(13UL, cluster_->healthyHosts().size()); EXPECT_EQ(4UL, cluster_->localZoneHosts().size()); @@ -236,7 +236,7 @@ TEST_F(SdsTest, HealthChecker) { // Now set one of the removed hosts to unhealthy, and return the same query again, this should // remove it. - findHost("10.0.5.0")->healthy(false); + findHost("10.0.5.0")->healthFailureSet(Host::HealthFailures::ACTIVE_HC); setupRequest(); timer_->callback_(); EXPECT_CALL(*timer_, enableTimer(_));