From a252c218c9dc154919a58ca55cc4017ed6a2e017 Mon Sep 17 00:00:00 2001 From: Roman Dzhabarov Date: Wed, 21 Sep 2016 16:47:01 -0700 Subject: [PATCH 01/12] Zone aware routing in Envoy. Conflicts: source/common/upstream/load_balancer_impl.cc --- include/envoy/upstream/upstream.h | 6 +- source/common/upstream/load_balancer_impl.cc | 37 ++++++++++- source/common/upstream/upstream_impl.cc | 7 ++ .../upstream/load_balancer_impl_test.cc | 64 ++++++++++++++++++- test/common/upstream/sds_test.cc | 4 ++ test/mocks/upstream/mocks.cc | 2 + test/mocks/upstream/mocks.h | 2 + 7 files changed, 118 insertions(+), 4 deletions(-) diff --git a/include/envoy/upstream/upstream.h b/include/envoy/upstream/upstream.h index f41c9ef84938..53ad7df1b5e0 100644 --- a/include/envoy/upstream/upstream.h +++ b/include/envoy/upstream/upstream.h @@ -160,7 +160,11 @@ class HostSet { COUNTER(update_attempt) \ COUNTER(update_success) \ COUNTER(update_failure) \ - GAUGE (max_host_weight) + GAUGE (max_host_weight) \ + GAUGE (upstream_zone_count) \ + COUNTER(upstream_zone_above_threshold) \ + COUNTER(upstream_zone_healthy_panic) \ + COUNTER(upstream_zone_within_threshold) // clang-format on /** diff --git a/source/common/upstream/load_balancer_impl.cc b/source/common/upstream/load_balancer_impl.cc index 5c57136b630f..8b381927e87e 100644 --- a/source/common/upstream/load_balancer_impl.cc +++ b/source/common/upstream/load_balancer_impl.cc @@ -14,18 +14,51 @@ const std::vector& LoadBalancerBase::hostsToUse() { return host_set_.hosts(); } - uint64_t threshold = + uint64_t global_panic_threshold = std::min(100UL, runtime_.snapshot().getInteger("upstream.healthy_panic_threshold", 50)); double healthy_percent = (host_set_.healthyHosts().size() / static_cast(host_set_.hosts().size())) * 100; // If the % of healthy hosts in the cluster is less than our panic threshold, we use all hosts. - if (healthy_percent < threshold) { + if (healthy_percent < global_panic_threshold) { stats_.upstream_rq_lb_healthy_panic_.inc(); return host_set_.hosts(); } else { return host_set_.healthyHosts(); } + + // Check if we need to perform zone aware routing, by default disabled. + if (runtime_.snapshot().featureEnabled("upstream.zone_routing.enabled", 0)) { + double zone_to_all_percent = + 100.0 * host_set_.localZoneHealthyHosts().size() / host_set_.healthyHosts().size(); + double expected_percent = 100.0 / stats_.upstream_zone_count_.value(); + + uint64_t zone_percent_diff = + runtime_.snapshot().getInteger("upstream.zone_routing.percent_diff", 0); + + // Hosts should be roughly equally distributed between zones. + if (std::abs(zone_to_all_percent - expected_percent) > zone_percent_diff) { + stats_.upstream_zone_above_threshold_.inc(); + + return host_set_.healthyHosts(); + } else { + stats_.upstream_zone_within_threshold_.inc(); + } + + uint64_t zone_panic_threshold = + runtime_.snapshot().getInteger("upstream.zone_routing.healthy_panic_threshold", 80); + double zone_healthy_percent = + 100.0 * host_set_.localZoneHealthyHosts().size() / host_set_.localZoneHosts().size(); + if (zone_healthy_percent < zone_panic_threshold) { + stats_.upstream_zone_healthy_panic_.inc(); + + return host_set_.healthyHosts(); + } + + return host_set_.localZoneHealthyHosts(); + } + + return host_set_.healthyHosts(); } ConstHostPtr RoundRobinLoadBalancer::chooseHost() { diff --git a/source/common/upstream/upstream_impl.cc b/source/common/upstream/upstream_impl.cc index 6a0e267672d1..04d970f8f99d 100644 --- a/source/common/upstream/upstream_impl.cc +++ b/source/common/upstream/upstream_impl.cc @@ -1,3 +1,4 @@ +#include #include "upstream_impl.h" #include "envoy/event/dispatcher.h" @@ -177,6 +178,7 @@ bool BaseDynamicClusterImpl::updateDynamicHostList(const std::vector& n std::vector& hosts_removed, bool depend_on_hc) { uint64_t max_host_weight = 1; + std::unordered_set zones; // Go through and see if the list we have is different from what we just got. If it is, we // make a new host list and raise a change notification. This uses an N^2 search given that @@ -185,6 +187,8 @@ bool BaseDynamicClusterImpl::updateDynamicHostList(const std::vector& n for (HostPtr host : new_hosts) { bool found = false; for (auto i = current_hosts.begin(); i != current_hosts.end();) { + zones.insert((*i)->zone()); + // If we find a host matched based on URL, we keep it. However we do change weight inline so // do that here. if ((*i)->url() == host->url()) { @@ -205,6 +209,7 @@ bool BaseDynamicClusterImpl::updateDynamicHostList(const std::vector& n if (host->weight() > max_host_weight) { max_host_weight = host->weight(); } + zones.insert(host->zone()); final_hosts.push_back(host); hosts_added.push_back(host); @@ -221,6 +226,7 @@ bool BaseDynamicClusterImpl::updateDynamicHostList(const std::vector& n if ((*i)->weight() > max_host_weight) { max_host_weight = (*i)->weight(); } + zones.insert((*i)->zone()); final_hosts.push_back(*i); i = current_hosts.erase(i); @@ -231,6 +237,7 @@ bool BaseDynamicClusterImpl::updateDynamicHostList(const std::vector& n } stats_.max_host_weight_.set(max_host_weight); + stats_.upstream_zone_count_.set(zones.size()); if (!hosts_added.empty() || !current_hosts.empty()) { hosts_removed = std::move(current_hosts); diff --git a/test/common/upstream/load_balancer_impl_test.cc b/test/common/upstream/load_balancer_impl_test.cc index 94f132bde776..35d02ad03753 100644 --- a/test/common/upstream/load_balancer_impl_test.cc +++ b/test/common/upstream/load_balancer_impl_test.cc @@ -66,6 +66,68 @@ TEST_F(RoundRobinLoadBalancerTest, MaxUnhealthyPanic) { EXPECT_EQ(3UL, stats_.upstream_rq_lb_healthy_panic_.value()); } +TEST_F(RoundRobinLoadBalancerTest, ZoneAwareRoutingDone) { + cluster_.healthy_hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:80"), + newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81"), + newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:82")}; + cluster_.hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:80"), + newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81"), + newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:82")}; + cluster_.local_zone_hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81")}; + cluster_.local_zone_healthy_hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81")}; + stats_.upstream_zone_count_.set(3UL); + + EXPECT_CALL(runtime_.snapshot_, featureEnabled("upstream.zone_routing.enabled", 0)) + .WillRepeatedly(Return(true)); + EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.healthy_panic_threshold", 80)) + .WillRepeatedly(Return(80)); + EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.percent_diff", 0)) + .WillRepeatedly(Return(2)); + EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.healthy_panic_threshold", 50)) + .WillRepeatedly(Return(50)); + + // There is only one host in the given zone for zone aware routing. + EXPECT_EQ(cluster_.local_zone_healthy_hosts_[0], lb_.chooseHost()); + EXPECT_EQ(1UL, stats_.upstream_zone_within_threshold_.value()); + + EXPECT_EQ(cluster_.local_zone_healthy_hosts_[0], lb_.chooseHost()); + EXPECT_EQ(2UL, stats_.upstream_zone_within_threshold_.value()); + + // Disable runtime global zone routing. + EXPECT_CALL(runtime_.snapshot_, featureEnabled("upstream.zone_routing.enabled", 0)) + .WillRepeatedly(Return(false)); + EXPECT_EQ(cluster_.healthy_hosts_[2], lb_.chooseHost()); + EXPECT_EQ(2UL, stats_.upstream_zone_within_threshold_.value()); +} + +TEST_F(RoundRobinLoadBalancerTest, ZoneAwareRoutingNotHealthy) { + cluster_.healthy_hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:80"), + newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81"), + newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:82")}; + cluster_.hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:80"), + newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81"), + newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:82")}; + cluster_.local_zone_hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81")}; + cluster_.local_zone_healthy_hosts_ = {}; + stats_.upstream_zone_count_.set(3UL); + + EXPECT_CALL(runtime_.snapshot_, featureEnabled("upstream.zone_routing.enabled", 0)) + .WillRepeatedly(Return(true)); + EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.healthy_panic_threshold", 80)) + .WillRepeatedly(Return(80)); + EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.percent_diff", 0)) + .WillRepeatedly(Return(2)); + EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.healthy_panic_threshold", 50)) + .WillRepeatedly(Return(50)); + + // There is only one host in the given zone for zone aware routing. + EXPECT_EQ(cluster_.healthy_hosts_[0], lb_.chooseHost()); + EXPECT_EQ(1UL, stats_.upstream_zone_above_threshold_.value()); + + EXPECT_EQ(cluster_.healthy_hosts_[1], lb_.chooseHost()); + EXPECT_EQ(2UL, stats_.upstream_zone_above_threshold_.value()); +} + class LeastRequestLoadBalancerTest : public testing::Test { public: LeastRequestLoadBalancerTest() : stats_(ClusterImplBase::generateStats("", stats_store_)) {} @@ -135,7 +197,7 @@ TEST_F(LeastRequestLoadBalancerTest, Normal) { EXPECT_EQ(cluster_.healthy_hosts_[1], lb_.chooseHost()); } -TEST_F(LeastRequestLoadBalancerTest, WeightImbalanceRuntimOff) { +TEST_F(LeastRequestLoadBalancerTest, WeightImbalanceRuntimeOff) { // Disable weight balancing. EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.weight_enabled", 1)) .WillRepeatedly(Return(0)); diff --git a/test/common/upstream/sds_test.cc b/test/common/upstream/sds_test.cc index 59a10ccd6b17..50764d276580 100644 --- a/test/common/upstream/sds_test.cc +++ b/test/common/upstream/sds_test.cc @@ -129,6 +129,7 @@ TEST_F(SdsTest, NoHealthChecker) { EXPECT_EQ("us-east-1d", canary_host->zone()); EXPECT_EQ(1U, canary_host->weight()); EXPECT_EQ(1UL, cluster_->stats().max_host_weight_.value()); + EXPECT_EQ(3UL, cluster_->stats().upstream_zone_count_.value()); // Test response with weight change. We should still have the same host. setupRequest(); @@ -146,6 +147,7 @@ TEST_F(SdsTest, NoHealthChecker) { EXPECT_EQ("us-east-1d", canary_host->zone()); EXPECT_EQ(50U, canary_host->weight()); EXPECT_EQ(50UL, cluster_->stats().max_host_weight_.value()); + EXPECT_EQ(3UL, cluster_->stats().upstream_zone_count_.value()); // Now test the failure case, our cluster size should not change. setupRequest(); @@ -156,6 +158,7 @@ TEST_F(SdsTest, NoHealthChecker) { EXPECT_EQ(13UL, cluster_->hosts().size()); EXPECT_EQ(50U, canary_host->weight()); EXPECT_EQ(50UL, cluster_->stats().max_host_weight_.value()); + EXPECT_EQ(3UL, cluster_->stats().upstream_zone_count_.value()); // 503 response. setupRequest(); @@ -168,6 +171,7 @@ TEST_F(SdsTest, NoHealthChecker) { EXPECT_EQ(13UL, cluster_->hosts().size()); EXPECT_EQ(50U, canary_host->weight()); EXPECT_EQ(50UL, cluster_->stats().max_host_weight_.value()); + EXPECT_EQ(3UL, cluster_->stats().upstream_zone_count_.value()); } TEST_F(SdsTest, HealthChecker) { diff --git a/test/mocks/upstream/mocks.cc b/test/mocks/upstream/mocks.cc index bf0ff5b2a440..5396113decb6 100644 --- a/test/mocks/upstream/mocks.cc +++ b/test/mocks/upstream/mocks.cc @@ -25,6 +25,8 @@ MockCluster::MockCluster() ON_CALL(*this, connectTimeout()).WillByDefault(Return(std::chrono::milliseconds(1))); ON_CALL(*this, hosts()).WillByDefault(ReturnRef(hosts_)); ON_CALL(*this, healthyHosts()).WillByDefault(ReturnRef(healthy_hosts_)); + ON_CALL(*this, localZoneHosts()).WillByDefault(ReturnRef(local_zone_hosts_)); + ON_CALL(*this, localZoneHealthyHosts()).WillByDefault(ReturnRef(local_zone_healthy_hosts_)); ON_CALL(*this, name()).WillByDefault(ReturnRef(name_)); ON_CALL(*this, altStatName()).WillByDefault(ReturnRef(alt_stat_name_)); ON_CALL(*this, lbType()).WillByDefault(Return(Upstream::LoadBalancerType::RoundRobin)); diff --git a/test/mocks/upstream/mocks.h b/test/mocks/upstream/mocks.h index 98d6f4e81072..62cb1ac2525f 100644 --- a/test/mocks/upstream/mocks.h +++ b/test/mocks/upstream/mocks.h @@ -47,6 +47,8 @@ class MockCluster : public Cluster { std::vector hosts_; std::vector healthy_hosts_; + std::vector local_zone_hosts_; + std::vector local_zone_healthy_hosts_; std::string name_{"fake_cluster"}; std::string alt_stat_name_{"fake_alt_cluster"}; std::list callbacks_; From 81a7ef724eecd7992199d7e32c91dff45ba491e1 Mon Sep 17 00:00:00 2001 From: Roman Dzhabarov Date: Wed, 21 Sep 2016 17:38:18 -0700 Subject: [PATCH 02/12] Put some docs around zone aware routing. --- .../configuration/cluster_manager/cluster_runtime.rst | 11 +++++++++++ docs/intro/arch_overview/load_balancing.rst | 8 ++++++++ source/common/upstream/load_balancer_impl.cc | 2 -- source/common/upstream/upstream_impl.cc | 1 - 4 files changed, 19 insertions(+), 3 deletions(-) diff --git a/docs/configuration/cluster_manager/cluster_runtime.rst b/docs/configuration/cluster_manager/cluster_runtime.rst index 1f717c5d7af8..8056532b3298 100644 --- a/docs/configuration/cluster_manager/cluster_runtime.rst +++ b/docs/configuration/cluster_manager/cluster_runtime.rst @@ -31,3 +31,14 @@ upstream.use_http2 upstream.weight_enabled Binary switch to turn on or off weighted load balancing. If set to non 0, weighted load balancing is enabled. Defaults to enabled. + +upstream.zone_routing.enabled + What % of requests will be attempted to be routed to the same upstream zone. Defaults to 0%. + +upstream.zone_routing.percent_diff + Perform zone aware routing only if percent of upstream hosts in the same zone within + the percent_diff of expected. Expected is calculated as 100 / number_of_zones. + +upstream.zone_routing.healthy_panic_threshold + Defines the :ref:`zone healthy panic threshold ` via runtime. The panic threshold is used to avoid a situation in which host failures cascade throughout the cluster as load increases. + +Zone panic threshold +-------------------- + +When Envoy performs zone aware routing it will send traffic to the same upstream zone +only if percent of healthy hosts in the given upstream zone is bigger than this threshold. +Default panic threshold is 80%. This is :ref:`configurable ` +via runtime. diff --git a/source/common/upstream/load_balancer_impl.cc b/source/common/upstream/load_balancer_impl.cc index 8b381927e87e..7255e8ffd08a 100644 --- a/source/common/upstream/load_balancer_impl.cc +++ b/source/common/upstream/load_balancer_impl.cc @@ -23,8 +23,6 @@ const std::vector& LoadBalancerBase::hostsToUse() { if (healthy_percent < global_panic_threshold) { stats_.upstream_rq_lb_healthy_panic_.inc(); return host_set_.hosts(); - } else { - return host_set_.healthyHosts(); } // Check if we need to perform zone aware routing, by default disabled. diff --git a/source/common/upstream/upstream_impl.cc b/source/common/upstream/upstream_impl.cc index 04d970f8f99d..c9e647e38b07 100644 --- a/source/common/upstream/upstream_impl.cc +++ b/source/common/upstream/upstream_impl.cc @@ -1,4 +1,3 @@ -#include #include "upstream_impl.h" #include "envoy/event/dispatcher.h" From a39814ea93caf69cf366171698bde233b0271362 Mon Sep 17 00:00:00 2001 From: Roman Dzhabarov Date: Wed, 21 Sep 2016 20:14:23 -0700 Subject: [PATCH 03/12] Fix doc issue. --- docs/configuration/cluster_manager/cluster_runtime.rst | 2 +- docs/intro/arch_overview/load_balancing.rst | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/configuration/cluster_manager/cluster_runtime.rst b/docs/configuration/cluster_manager/cluster_runtime.rst index 8056532b3298..fe98f5ddf316 100644 --- a/docs/configuration/cluster_manager/cluster_runtime.rst +++ b/docs/configuration/cluster_manager/cluster_runtime.rst @@ -40,5 +40,5 @@ upstream.zone_routing.percent_diff the percent_diff of expected. Expected is calculated as 100 / number_of_zones. upstream.zone_routing.healthy_panic_threshold - Defines the :ref:`zone healthy panic threshold ` percentage. Defaults to 80%. diff --git a/docs/intro/arch_overview/load_balancing.rst b/docs/intro/arch_overview/load_balancing.rst index ccf3d8f59def..6d0ba777fb2e 100644 --- a/docs/intro/arch_overview/load_balancing.rst +++ b/docs/intro/arch_overview/load_balancing.rst @@ -51,6 +51,8 @@ panic threshold is 50%. This is :ref:`configurable Date: Thu, 22 Sep 2016 16:29:59 -0700 Subject: [PATCH 04/12] tmp. --- .../cluster_manager/cluster_runtime.rst | 2 +- include/envoy/upstream/upstream.h | 1 + source/common/upstream/load_balancer_impl.cc | 7 ++--- .../upstream/load_balancer_impl_test.cc | 26 ++++++++++++++++--- 4 files changed, 29 insertions(+), 7 deletions(-) diff --git a/docs/configuration/cluster_manager/cluster_runtime.rst b/docs/configuration/cluster_manager/cluster_runtime.rst index fe98f5ddf316..b29672215a89 100644 --- a/docs/configuration/cluster_manager/cluster_runtime.rst +++ b/docs/configuration/cluster_manager/cluster_runtime.rst @@ -33,7 +33,7 @@ upstream.weight_enabled is enabled. Defaults to enabled. upstream.zone_routing.enabled - What % of requests will be attempted to be routed to the same upstream zone. Defaults to 0%. + What % of requests will be attempted to be routed to the same upstream zone. By default enabled. upstream.zone_routing.percent_diff Perform zone aware routing only if percent of upstream hosts in the same zone within diff --git a/include/envoy/upstream/upstream.h b/include/envoy/upstream/upstream.h index 53ad7df1b5e0..ae5a6c2a63f8 100644 --- a/include/envoy/upstream/upstream.h +++ b/include/envoy/upstream/upstream.h @@ -2,6 +2,7 @@ #include "envoy/common/optional.h" #include "envoy/network/connection.h" +#include "envoy/runtime/runtime.h" #include "envoy/ssl/context.h" #include "envoy/upstream/load_balancer_type.h" #include "envoy/upstream/resource_manager.h" diff --git a/source/common/upstream/load_balancer_impl.cc b/source/common/upstream/load_balancer_impl.cc index 7255e8ffd08a..25553ab5cdd6 100644 --- a/source/common/upstream/load_balancer_impl.cc +++ b/source/common/upstream/load_balancer_impl.cc @@ -25,14 +25,15 @@ const std::vector& LoadBalancerBase::hostsToUse() { return host_set_.hosts(); } - // Check if we need to perform zone aware routing, by default disabled. - if (runtime_.snapshot().featureEnabled("upstream.zone_routing.enabled", 0)) { + // Attempt to do zone aware routing if there are at least 2 upstream zones and it's enabled. + if (stats_.upstream_zone_count_.value() > 1 && + runtime_.snapshot().featureEnabled("upstream.zone_routing.enabled", 100)) { double zone_to_all_percent = 100.0 * host_set_.localZoneHealthyHosts().size() / host_set_.healthyHosts().size(); double expected_percent = 100.0 / stats_.upstream_zone_count_.value(); uint64_t zone_percent_diff = - runtime_.snapshot().getInteger("upstream.zone_routing.percent_diff", 0); + runtime_.snapshot().getInteger("upstream.zone_routing.percent_diff", 3); // Hosts should be roughly equally distributed between zones. if (std::abs(zone_to_all_percent - expected_percent) > zone_percent_diff) { diff --git a/test/common/upstream/load_balancer_impl_test.cc b/test/common/upstream/load_balancer_impl_test.cc index 35d02ad03753..dfe621498c3a 100644 --- a/test/common/upstream/load_balancer_impl_test.cc +++ b/test/common/upstream/load_balancer_impl_test.cc @@ -77,7 +77,7 @@ TEST_F(RoundRobinLoadBalancerTest, ZoneAwareRoutingDone) { cluster_.local_zone_healthy_hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81")}; stats_.upstream_zone_count_.set(3UL); - EXPECT_CALL(runtime_.snapshot_, featureEnabled("upstream.zone_routing.enabled", 0)) + EXPECT_CALL(runtime_.snapshot_, featureEnabled("upstream.zone_routing.enabled", 100)) .WillRepeatedly(Return(true)); EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.healthy_panic_threshold", 80)) .WillRepeatedly(Return(80)); @@ -94,12 +94,32 @@ TEST_F(RoundRobinLoadBalancerTest, ZoneAwareRoutingDone) { EXPECT_EQ(2UL, stats_.upstream_zone_within_threshold_.value()); // Disable runtime global zone routing. - EXPECT_CALL(runtime_.snapshot_, featureEnabled("upstream.zone_routing.enabled", 0)) + EXPECT_CALL(runtime_.snapshot_, featureEnabled("upstream.zone_routing.enabled", 100)) .WillRepeatedly(Return(false)); EXPECT_EQ(cluster_.healthy_hosts_[2], lb_.chooseHost()); EXPECT_EQ(2UL, stats_.upstream_zone_within_threshold_.value()); } +TEST_F(RoundRobinLoadBalancerTest, NoZoneAwareRoutingOneZone) { + cluster_.healthy_hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:80")}; + cluster_.hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:80")}; + cluster_.local_zone_hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:80")}; + cluster_.local_zone_healthy_hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:80")}; + stats_.upstream_zone_count_.set(1UL); + + EXPECT_CALL(runtime_.snapshot_, featureEnabled("upstream.zone_routing.enabled", 100)).Times(0); + EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.healthy_panic_threshold", 80)) + .Times(0); + EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.percent_diff", 0)).Times(0); + EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.healthy_panic_threshold", 50)) + .WillRepeatedly(Return(50)); + + // There is only one host in the given zone for zone aware routing. + EXPECT_EQ(cluster_.healthy_hosts_[0], lb_.chooseHost()); + EXPECT_EQ(0UL, stats_.upstream_zone_within_threshold_.value()); + EXPECT_EQ(0UL, stats_.upstream_zone_above_threshold_.value()); +} + TEST_F(RoundRobinLoadBalancerTest, ZoneAwareRoutingNotHealthy) { cluster_.healthy_hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:80"), newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81"), @@ -111,7 +131,7 @@ TEST_F(RoundRobinLoadBalancerTest, ZoneAwareRoutingNotHealthy) { cluster_.local_zone_healthy_hosts_ = {}; stats_.upstream_zone_count_.set(3UL); - EXPECT_CALL(runtime_.snapshot_, featureEnabled("upstream.zone_routing.enabled", 0)) + EXPECT_CALL(runtime_.snapshot_, featureEnabled("upstream.zone_routing.enabled", 100)) .WillRepeatedly(Return(true)); EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.healthy_panic_threshold", 80)) .WillRepeatedly(Return(80)); From 83a02ad465be8d9f52d1be677bbf9fb4d059c9bd Mon Sep 17 00:00:00 2001 From: Roman Dzhabarov Date: Thu, 22 Sep 2016 16:45:02 -0700 Subject: [PATCH 05/12] Fix test, supply 3 as default diff. --- test/common/upstream/load_balancer_impl_test.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/common/upstream/load_balancer_impl_test.cc b/test/common/upstream/load_balancer_impl_test.cc index dfe621498c3a..2c889e0649b6 100644 --- a/test/common/upstream/load_balancer_impl_test.cc +++ b/test/common/upstream/load_balancer_impl_test.cc @@ -81,7 +81,7 @@ TEST_F(RoundRobinLoadBalancerTest, ZoneAwareRoutingDone) { .WillRepeatedly(Return(true)); EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.healthy_panic_threshold", 80)) .WillRepeatedly(Return(80)); - EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.percent_diff", 0)) + EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.percent_diff", 3)) .WillRepeatedly(Return(2)); EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.healthy_panic_threshold", 50)) .WillRepeatedly(Return(50)); @@ -110,7 +110,7 @@ TEST_F(RoundRobinLoadBalancerTest, NoZoneAwareRoutingOneZone) { EXPECT_CALL(runtime_.snapshot_, featureEnabled("upstream.zone_routing.enabled", 100)).Times(0); EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.healthy_panic_threshold", 80)) .Times(0); - EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.percent_diff", 0)).Times(0); + EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.percent_diff", 3)).Times(0); EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.healthy_panic_threshold", 50)) .WillRepeatedly(Return(50)); @@ -135,7 +135,7 @@ TEST_F(RoundRobinLoadBalancerTest, ZoneAwareRoutingNotHealthy) { .WillRepeatedly(Return(true)); EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.healthy_panic_threshold", 80)) .WillRepeatedly(Return(80)); - EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.percent_diff", 0)) + EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.percent_diff", 3)) .WillRepeatedly(Return(2)); EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.healthy_panic_threshold", 50)) .WillRepeatedly(Return(50)); From 9cc4d61032814d2637357766f8527c84823468c8 Mon Sep 17 00:00:00 2001 From: Roman Dzhabarov Date: Thu, 22 Sep 2016 16:48:44 -0700 Subject: [PATCH 06/12] small fixes. --- include/envoy/upstream/upstream.h | 1 - test/common/upstream/load_balancer_impl_test.cc | 1 - 2 files changed, 2 deletions(-) diff --git a/include/envoy/upstream/upstream.h b/include/envoy/upstream/upstream.h index ae5a6c2a63f8..53ad7df1b5e0 100644 --- a/include/envoy/upstream/upstream.h +++ b/include/envoy/upstream/upstream.h @@ -2,7 +2,6 @@ #include "envoy/common/optional.h" #include "envoy/network/connection.h" -#include "envoy/runtime/runtime.h" #include "envoy/ssl/context.h" #include "envoy/upstream/load_balancer_type.h" #include "envoy/upstream/resource_manager.h" diff --git a/test/common/upstream/load_balancer_impl_test.cc b/test/common/upstream/load_balancer_impl_test.cc index 2c889e0649b6..fa7e4334924b 100644 --- a/test/common/upstream/load_balancer_impl_test.cc +++ b/test/common/upstream/load_balancer_impl_test.cc @@ -114,7 +114,6 @@ TEST_F(RoundRobinLoadBalancerTest, NoZoneAwareRoutingOneZone) { EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.healthy_panic_threshold", 50)) .WillRepeatedly(Return(50)); - // There is only one host in the given zone for zone aware routing. EXPECT_EQ(cluster_.healthy_hosts_[0], lb_.chooseHost()); EXPECT_EQ(0UL, stats_.upstream_zone_within_threshold_.value()); EXPECT_EQ(0UL, stats_.upstream_zone_above_threshold_.value()); From ae480e869fee4a2315910e7140866133e8151a2b Mon Sep 17 00:00:00 2001 From: Roman Dzhabarov Date: Thu, 22 Sep 2016 19:17:57 -0700 Subject: [PATCH 07/12] Fix doc a bit. --- docs/configuration/cluster_manager/cluster_runtime.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/configuration/cluster_manager/cluster_runtime.rst b/docs/configuration/cluster_manager/cluster_runtime.rst index b29672215a89..20d9b90887a8 100644 --- a/docs/configuration/cluster_manager/cluster_runtime.rst +++ b/docs/configuration/cluster_manager/cluster_runtime.rst @@ -33,7 +33,8 @@ upstream.weight_enabled is enabled. Defaults to enabled. upstream.zone_routing.enabled - What % of requests will be attempted to be routed to the same upstream zone. By default enabled. + What % of requests will be attempted to be routed to the same upstream zone. By default enabled + to 100% of requests. upstream.zone_routing.percent_diff Perform zone aware routing only if percent of upstream hosts in the same zone within From cd213c1a69d0364fbacf86b8cb41cb4d64848abe Mon Sep 17 00:00:00 2001 From: Matt Klein Date: Sat, 24 Sep 2016 09:09:15 -0700 Subject: [PATCH 08/12] Update cluster_runtime.rst --- .../configuration/cluster_manager/cluster_runtime.rst | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/configuration/cluster_manager/cluster_runtime.rst b/docs/configuration/cluster_manager/cluster_runtime.rst index 20d9b90887a8..188b24830b4f 100644 --- a/docs/configuration/cluster_manager/cluster_runtime.rst +++ b/docs/configuration/cluster_manager/cluster_runtime.rst @@ -33,13 +33,14 @@ upstream.weight_enabled is enabled. Defaults to enabled. upstream.zone_routing.enabled - What % of requests will be attempted to be routed to the same upstream zone. By default enabled - to 100% of requests. + % of requests that will be routed to the same upstream zone. Defaults to 100% of requests. upstream.zone_routing.percent_diff - Perform zone aware routing only if percent of upstream hosts in the same zone within - the percent_diff of expected. Expected is calculated as 100 / number_of_zones. + Zone aware routing will be used only if the percent of upstream hosts in the same zone is within + percent_diff of expected. Expected is calculated as 100 / number_of_zones. This prevents Envoy + from using same zone routing if the zones are not balanced well. upstream.zone_routing.healthy_panic_threshold Defines the :ref:`zone healthy panic threshold ` - percentage. Defaults to 80%. + percentage. Defaults to 80%. If the % of healthy hosts in the current zone falls below this % + all healthy hosts will be used for routing. From 3dadb9c8f2aa4ba68aed196a4814e00b3cd3f549 Mon Sep 17 00:00:00 2001 From: Matt Klein Date: Sat, 24 Sep 2016 09:13:21 -0700 Subject: [PATCH 09/12] Update load_balancing.rst --- docs/intro/arch_overview/load_balancing.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/intro/arch_overview/load_balancing.rst b/docs/intro/arch_overview/load_balancing.rst index 6d0ba777fb2e..998a85b4697b 100644 --- a/docs/intro/arch_overview/load_balancing.rst +++ b/docs/intro/arch_overview/load_balancing.rst @@ -53,10 +53,10 @@ the cluster as load increases. .. _arch_overview_load_balancing_zone_panic_threshold: -Zone panic threshold --------------------- +Zone aware routing and local zone panic threshold +------------------------------------------------- -When Envoy performs zone aware routing it will send traffic to the same upstream zone -only if percent of healthy hosts in the given upstream zone is bigger than this threshold. -Default panic threshold is 80%. This is :ref:`configurable ` -via runtime. +By default Envoy performs zone aware routing where it will send traffic to the same upstream zone. +This is only done if the zones are well balanced (defaults to 3% allowed deviation) and if there +are enough healthy hosts in the local zone (the *panic threshold* which defaults to 80%). These are +:ref:`configurable ` via runtime. From 288ae95fa91ae821a7b69fed3da62955f0fcd155 Mon Sep 17 00:00:00 2001 From: Matt Klein Date: Sat, 24 Sep 2016 09:13:42 -0700 Subject: [PATCH 10/12] Update cluster_runtime.rst --- docs/configuration/cluster_manager/cluster_runtime.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/configuration/cluster_manager/cluster_runtime.rst b/docs/configuration/cluster_manager/cluster_runtime.rst index 188b24830b4f..ab3d069caf63 100644 --- a/docs/configuration/cluster_manager/cluster_runtime.rst +++ b/docs/configuration/cluster_manager/cluster_runtime.rst @@ -38,7 +38,7 @@ upstream.zone_routing.enabled upstream.zone_routing.percent_diff Zone aware routing will be used only if the percent of upstream hosts in the same zone is within percent_diff of expected. Expected is calculated as 100 / number_of_zones. This prevents Envoy - from using same zone routing if the zones are not balanced well. + from using same zone routing if the zones are not balanced well. Defaults to 3% allowed deviation. upstream.zone_routing.healthy_panic_threshold Defines the :ref:`zone healthy panic threshold ` From 7cf16cb135054a6a3cb16959c1a4860e6af945be Mon Sep 17 00:00:00 2001 From: Roman Dzhabarov Date: Sat, 24 Sep 2016 13:58:09 -0700 Subject: [PATCH 11/12] Early exit + test. --- source/common/upstream/load_balancer_impl.cc | 50 +++++++++---------- .../upstream/load_balancer_impl_test.cc | 35 +++++++++++-- 2 files changed, 55 insertions(+), 30 deletions(-) diff --git a/source/common/upstream/load_balancer_impl.cc b/source/common/upstream/load_balancer_impl.cc index 25553ab5cdd6..87945c748318 100644 --- a/source/common/upstream/load_balancer_impl.cc +++ b/source/common/upstream/load_balancer_impl.cc @@ -25,39 +25,39 @@ const std::vector& LoadBalancerBase::hostsToUse() { return host_set_.hosts(); } - // Attempt to do zone aware routing if there are at least 2 upstream zones and it's enabled. - if (stats_.upstream_zone_count_.value() > 1 && - runtime_.snapshot().featureEnabled("upstream.zone_routing.enabled", 100)) { - double zone_to_all_percent = - 100.0 * host_set_.localZoneHealthyHosts().size() / host_set_.healthyHosts().size(); - double expected_percent = 100.0 / stats_.upstream_zone_count_.value(); + // Early exit if we cannot perform zone aware routing. + if (stats_.upstream_zone_count_.value() < 2 || host_set_.localZoneHealthyHosts().empty() || + !runtime_.snapshot().featureEnabled("upstream.zone_routing.enabled", 100)) { + return host_set_.healthyHosts(); + } - uint64_t zone_percent_diff = - runtime_.snapshot().getInteger("upstream.zone_routing.percent_diff", 3); + double zone_to_all_percent = + 100.0 * host_set_.localZoneHealthyHosts().size() / host_set_.healthyHosts().size(); + double expected_percent = 100.0 / stats_.upstream_zone_count_.value(); - // Hosts should be roughly equally distributed between zones. - if (std::abs(zone_to_all_percent - expected_percent) > zone_percent_diff) { - stats_.upstream_zone_above_threshold_.inc(); + uint64_t zone_percent_diff = + runtime_.snapshot().getInteger("upstream.zone_routing.percent_diff", 3); - return host_set_.healthyHosts(); - } else { - stats_.upstream_zone_within_threshold_.inc(); - } + // Hosts should be roughly equally distributed between zones. + if (std::abs(zone_to_all_percent - expected_percent) > zone_percent_diff) { + stats_.upstream_zone_above_threshold_.inc(); - uint64_t zone_panic_threshold = - runtime_.snapshot().getInteger("upstream.zone_routing.healthy_panic_threshold", 80); - double zone_healthy_percent = - 100.0 * host_set_.localZoneHealthyHosts().size() / host_set_.localZoneHosts().size(); - if (zone_healthy_percent < zone_panic_threshold) { - stats_.upstream_zone_healthy_panic_.inc(); + return host_set_.healthyHosts(); + } - return host_set_.healthyHosts(); - } + stats_.upstream_zone_within_threshold_.inc(); + + uint64_t zone_panic_threshold = + runtime_.snapshot().getInteger("upstream.zone_routing.healthy_panic_threshold", 80); + double zone_healthy_percent = + 100.0 * host_set_.localZoneHealthyHosts().size() / host_set_.localZoneHosts().size(); + if (zone_healthy_percent < zone_panic_threshold) { + stats_.upstream_zone_healthy_panic_.inc(); - return host_set_.localZoneHealthyHosts(); + return host_set_.healthyHosts(); } - return host_set_.healthyHosts(); + return host_set_.localZoneHealthyHosts(); } ConstHostPtr RoundRobinLoadBalancer::chooseHost() { diff --git a/test/common/upstream/load_balancer_impl_test.cc b/test/common/upstream/load_balancer_impl_test.cc index fa7e4334924b..5f7ccdb1839a 100644 --- a/test/common/upstream/load_balancer_impl_test.cc +++ b/test/common/upstream/load_balancer_impl_test.cc @@ -132,17 +132,42 @@ TEST_F(RoundRobinLoadBalancerTest, ZoneAwareRoutingNotHealthy) { EXPECT_CALL(runtime_.snapshot_, featureEnabled("upstream.zone_routing.enabled", 100)) .WillRepeatedly(Return(true)); + EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.healthy_panic_threshold", 50)) + .WillRepeatedly(Return(50)); + + // Should not be called due to early exit. EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.healthy_panic_threshold", 80)) - .WillRepeatedly(Return(80)); - EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.percent_diff", 3)) - .WillRepeatedly(Return(2)); + .Times(0); + EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.percent_diff", 3)).Times(0); + + // local zone has no healthy hosts, take from the all healthy hosts. + EXPECT_EQ(cluster_.healthy_hosts_[0], lb_.chooseHost()); + EXPECT_EQ(cluster_.healthy_hosts_[1], lb_.chooseHost()); +} + +TEST_F(RoundRobinLoadBalancerTest, ZoneAwareRoutingNotEnoughHealthy) { + cluster_.healthy_hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:80"), + newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81"), + newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:82")}; + cluster_.hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:80"), + newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81"), + newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:82")}; + cluster_.local_zone_hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81")}; + cluster_.local_zone_healthy_hosts_ = {newTestHost(Upstream::MockCluster{}, "tcp://127.0.0.1:81")}; + stats_.upstream_zone_count_.set(2UL); + + EXPECT_CALL(runtime_.snapshot_, featureEnabled("upstream.zone_routing.enabled", 100)) + .WillRepeatedly(Return(true)); EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.healthy_panic_threshold", 50)) .WillRepeatedly(Return(50)); + EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.healthy_panic_threshold", 80)) + .WillRepeatedly(Return(80)); + EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.percent_diff", 3)) + .WillRepeatedly(Return(3)); - // There is only one host in the given zone for zone aware routing. + // Not enough healthy hosts in local zone. EXPECT_EQ(cluster_.healthy_hosts_[0], lb_.chooseHost()); EXPECT_EQ(1UL, stats_.upstream_zone_above_threshold_.value()); - EXPECT_EQ(cluster_.healthy_hosts_[1], lb_.chooseHost()); EXPECT_EQ(2UL, stats_.upstream_zone_above_threshold_.value()); } From af719b634af8e4312ff82b4e428a43e85843ba8f Mon Sep 17 00:00:00 2001 From: Roman Dzhabarov Date: Mon, 26 Sep 2016 10:08:11 -0700 Subject: [PATCH 12/12] Fixed comments. --- source/common/upstream/load_balancer_impl.cc | 3 +-- source/common/upstream/upstream_impl.cc | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/source/common/upstream/load_balancer_impl.cc b/source/common/upstream/load_balancer_impl.cc index 87945c748318..c03e3c0daf06 100644 --- a/source/common/upstream/load_balancer_impl.cc +++ b/source/common/upstream/load_balancer_impl.cc @@ -16,8 +16,7 @@ const std::vector& LoadBalancerBase::hostsToUse() { uint64_t global_panic_threshold = std::min(100UL, runtime_.snapshot().getInteger("upstream.healthy_panic_threshold", 50)); - double healthy_percent = - (host_set_.healthyHosts().size() / static_cast(host_set_.hosts().size())) * 100; + double healthy_percent = 100.0 * host_set_.healthyHosts().size() / host_set_.hosts().size(); // If the % of healthy hosts in the cluster is less than our panic threshold, we use all hosts. if (healthy_percent < global_panic_threshold) { diff --git a/source/common/upstream/upstream_impl.cc b/source/common/upstream/upstream_impl.cc index c9e647e38b07..549376349d49 100644 --- a/source/common/upstream/upstream_impl.cc +++ b/source/common/upstream/upstream_impl.cc @@ -186,11 +186,10 @@ bool BaseDynamicClusterImpl::updateDynamicHostList(const std::vector& n for (HostPtr host : new_hosts) { bool found = false; for (auto i = current_hosts.begin(); i != current_hosts.end();) { - zones.insert((*i)->zone()); - // If we find a host matched based on URL, we keep it. However we do change weight inline so // do that here. if ((*i)->url() == host->url()) { + zones.insert((*i)->zone()); if (host->weight() > max_host_weight) { max_host_weight = host->weight(); }