From dc87ad2ee9e3b12ee473ad34611a282532ed38a5 Mon Sep 17 00:00:00 2001 From: Roman Zavodskikh Date: Tue, 14 May 2024 18:10:55 +0200 Subject: [PATCH] Added max-unhealthy-endpoints-ratio cmdline parameter for PHC Signed-off-by: Roman Zavodskikh --- docs/operation/operation.md | 12 +- proxy/healthy_endpoints.go | 7 ++ proxy/healthy_endpoints_test.go | 31 +++++ proxy/proxy.go | 18 ++- proxy/proxy_test.go | 197 ++++++++++++++++++++++---------- routing/endpointregistry.go | 7 ++ skipper.go | 1 + 7 files changed, 206 insertions(+), 67 deletions(-) diff --git a/docs/operation/operation.md b/docs/operation/operation.md index 37c6211e1d..c7ee3e9c5a 100644 --- a/docs/operation/operation.md +++ b/docs/operation/operation.md @@ -904,15 +904,20 @@ the Skipper takes a look at previous period and if the amount of requests in the and failed requests ratio is more than `min-drop-probability` for the given endpoints then Skipper will send reduced (the more `max-drop-probability` and failed requests ratio in previous period are, the stronger reduction is) amount of requests compared to amount sent without PHC. +If the ratio of unhealthy endpoints is more than `max-unhealthy-endpoints-ratio` then PHC becomes fail-open. This effectively means +if there are too many unhealthy endpoints PHC does not try to mitigate them any more and requests are sent like there is no PHC at all. Having this, we expect less requests to fail because a lot of them would be sent to endpoints that seem to be healthy instead. To enable this feature, you need to provide `-passive-health-check` option having forementioned parameters -(`period`, `min-requests`, `min-drop-probability`, `max-drop-probability`) defined. +(`period`, `min-requests`, `min-drop-probability`, `max-drop-probability`, `max-unhealthy-endpoints-ratio`) defined. `period`, `min-requests`, `max-drop-probability` are required parameters, it is not possible for PHC to be enabled without them explicitly defined by user. `min-drop-probability` is implicitly defined as `0.0` if not explicitly set by user. +`max-unhealthy-endpoints-ratio` is defined as `1.0` if not explicitly set by user. Valid examples of `-passive-health-check` are: ++ `-passive-health-check=period=1s,min-requests=10,min-drop-probability=0.05,max-drop-probability=0.9,max-unhealthy-endpoints-ratio=0.3` ++ `-passive-health-check=period=1s,min-requests=10,max-drop-probability=0.9,max-unhealthy-endpoints-ratio=0.3` + `-passive-health-check=period=1s,min-requests=10,min-drop-probability=0.05,max-drop-probability=0.9` + `-passive-health-check=period=1s,min-requests=10,max-drop-probability=0.9` @@ -923,9 +928,10 @@ The parameters of `-passive-health-check` option are: + `period=` - the duration of stats reset period + `min-requests=` - the minimum number of requests per `period` per backend endpoint required to activate PHC for this endpoint -+ `min-drop-probabilty=<0.0 <= p <= 1.0>` - the minimum possible probability of unhealthy endpoint being not considered while choosing the endpoint for the given request. The same value is in fact used as minimal failed requests ratio for PHC to be enabled for this endpoint -+ `max-drop-probabilty=<0.0 <= p <= 1.0>` - the maximum possible probability of unhealthy endpoint being not considered ++ `min-drop-probabilty=[0.0 <= p < max-drop-probability)` - the minimum possible probability of unhealthy endpoint being not considered while choosing the endpoint for the given request. The same value is in fact used as minimal failed requests ratio for PHC to be enabled for this endpoint ++ `max-drop-probabilty=(min-drop-probability < p <= 1.0]` - the maximum possible probability of unhealthy endpoint being not considered while choosing the endpoint for the given request ++ `max-unhealthy-endpoints-ratio=[0.0 <= r <= 1.0]` - the maximum ratio of unhealthy endpoints for PHC to try to mitigate ongoing requests ### Metrics diff --git a/proxy/healthy_endpoints.go b/proxy/healthy_endpoints.go index 00e1ef5db8..bb211ac4fe 100644 --- a/proxy/healthy_endpoints.go +++ b/proxy/healthy_endpoints.go @@ -19,6 +19,8 @@ func (h *healthyEndpoints) filterHealthyEndpoints(ctx *context, endpoints []rout p := h.rnd.Float64() + unhealthyEndpointsCount := 0 + maxUnhealthyEndpointsCount := float64(len(endpoints)) * h.endpointRegistry.MaxUnhealthyEndpointsRatio() filtered := make([]routing.LBEndpoint, 0, len(endpoints)) for _, e := range endpoints { dropProbability := e.Metrics.HealthCheckDropProbability() @@ -26,9 +28,14 @@ func (h *healthyEndpoints) filterHealthyEndpoints(ctx *context, endpoints []rout ctx.Logger().Infof("Dropping endpoint %q due to passive health check: p=%0.2f, dropProbability=%0.2f", e.Host, p, dropProbability) metrics.IncCounter("passive-health-check.endpoints.dropped") + unhealthyEndpointsCount++ } else { filtered = append(filtered, e) } + + if float64(unhealthyEndpointsCount) > maxUnhealthyEndpointsCount { + return endpoints + } } if len(filtered) == 0 { diff --git a/proxy/healthy_endpoints_test.go b/proxy/healthy_endpoints_test.go index 01cea436b0..42736f4468 100644 --- a/proxy/healthy_endpoints_test.go +++ b/proxy/healthy_endpoints_test.go @@ -29,6 +29,7 @@ func defaultEndpointRegistry() *routing.EndpointRegistry { MinRequests: 2, MaxHealthCheckDropProbability: 0.95, MinHealthCheckDropProbability: 0.01, + MaxUnhealthyEndpointsRatio: 1.0, }) } @@ -210,6 +211,7 @@ func TestPHC(t *testing.T) { MinRequests: 1, // with 2 test case fails on github actions MaxHealthCheckDropProbability: 0.95, MinHealthCheckDropProbability: 0.01, + MaxUnhealthyEndpointsRatio: 1.0, }) mockMetrics, ps := setupProxyWithCustomEndpointRegisty(t, fmt.Sprintf(`* -> backendTimeout("5ms") -> consistentHashKey("${request.header.ConsistentHashKey}") -> `, servicesString), consistantHashCustomEndpointRegistry) @@ -228,6 +230,7 @@ func TestPHC(t *testing.T) { MinRequests: 1, // with 2 test case fails on github actions MaxHealthCheckDropProbability: 0.95, MinHealthCheckDropProbability: 0.01, + MaxUnhealthyEndpointsRatio: 1.0, }) mockMetrics, ps := setupProxyWithCustomEndpointRegisty(t, fmt.Sprintf(`* -> backendTimeout("5ms") -> consistentHashKey("${request.header.ConsistentHashKey}") -> consistentHashBalanceFactor(1.25) -> `, servicesString), consistantHashCustomEndpointRegistry) @@ -241,3 +244,31 @@ func TestPHC(t *testing.T) { }) } } + +func TestPHCNoHealthyEndpoints(t *testing.T) { + const ( + healthy = 0 + unhealthy = 4 + ) + + servicesString := setupServices(t, healthy, unhealthy) + mockMetrics, ps := setupProxy(t, fmt.Sprintf(`* -> backendTimeout("20ms") -> `, + servicesString)) + va := fireVegeta(t, ps, 5000, 1*time.Second, 5*time.Second) + + count200, ok := va.CountStatus(200) + assert.False(t, ok) + + count504, ok := va.CountStatus(504) + assert.True(t, ok) + + failedRequests := math.Abs(float64(va.TotalRequests())) - float64(count200) + t.Logf("total requests: %d, count200: %d, count504: %d, failedRequests: %f", va.TotalRequests(), count200, count504, failedRequests) + + assert.InDelta(t, float64(count504), failedRequests, 5) + assert.InDelta(t, float64(va.TotalRequests()), float64(failedRequests), 0.1*float64(va.TotalRequests())) + mockMetrics.WithCounters(func(counters map[string]int64) { + assert.InDelta(t, float64(unhealthy)*float64(va.TotalRequests()), float64(counters["passive-health-check.endpoints.dropped"]), 0.3*float64(unhealthy)*float64(va.TotalRequests())) + assert.InDelta(t, 0.0, float64(counters["passive-health-check.requests.passed"]), 0.3*float64(nRequests)) // allow 30% error + }) +} diff --git a/proxy/proxy.go b/proxy/proxy.go index cc5061954a..4972768a8c 100644 --- a/proxy/proxy.go +++ b/proxy/proxy.go @@ -162,6 +162,10 @@ type PassiveHealthCheck struct { // The maximum probability of unhealthy endpoint to be dropped out from load balancing for every specific request MaxDropProbability float64 + + // MaxUnhealthyEndpointsRatio is the maximum ratio of unhealthy endpoints in the list of all endpoints PHC will check + // in case of all endpoints are unhealthy + MaxUnhealthyEndpointsRatio float64 } func InitPassiveHealthChecker(o map[string]string) (bool, *PassiveHealthCheck, error) { @@ -169,7 +173,10 @@ func InitPassiveHealthChecker(o map[string]string) (bool, *PassiveHealthCheck, e return false, &PassiveHealthCheck{}, nil } - result := &PassiveHealthCheck{} + result := &PassiveHealthCheck{ + MinDropProbability: 0.0, + MaxUnhealthyEndpointsRatio: 1.0, + } requiredParams := map[string]struct{}{ "period": {}, "max-drop-probability": {}, @@ -218,6 +225,15 @@ func InitPassiveHealthChecker(o map[string]string) (bool, *PassiveHealthCheck, e return false, nil, fmt.Errorf("passive health check: invalid minDropProbability value: %s", value) } result.MinDropProbability = minDropProbability + case "max-unhealthy-endpoints-ratio": + maxUnhealthyEndpointsRatio, err := strconv.ParseFloat(value, 64) + if err != nil { + return false, nil, fmt.Errorf("passive health check: invalid maxUnhealthyEndpointsRatio value: %q", value) + } + if maxUnhealthyEndpointsRatio < 0 || maxUnhealthyEndpointsRatio > 1 { + return false, nil, fmt.Errorf("passive health check: invalid maxUnhealthyEndpointsRatio value: %q", value) + } + result.MaxUnhealthyEndpointsRatio = maxUnhealthyEndpointsRatio default: return false, nil, fmt.Errorf("passive health check: invalid parameter: key=%s,value=%s", key, value) } diff --git a/proxy/proxy_test.go b/proxy/proxy_test.go index 51780a4f5d..b9c080b188 100644 --- a/proxy/proxy_test.go +++ b/proxy/proxy_test.go @@ -2296,10 +2296,11 @@ func TestInitPassiveHealthChecker(t *testing.T) { }, { inputArg: map[string]string{ - "period": "somethingInvalid", - "min-requests": "10", - "max-drop-probability": "0.9", - "min-drop-probability": "0.05", + "period": "somethingInvalid", + "min-requests": "10", + "max-drop-probability": "0.9", + "min-drop-probability": "0.05", + "max-unhealthy-endpoints-ratio": "0.3", }, expectedEnabled: false, expectedParams: nil, @@ -2307,26 +2308,29 @@ func TestInitPassiveHealthChecker(t *testing.T) { }, { inputArg: map[string]string{ - "period": "1m", - "min-requests": "10", - "max-drop-probability": "0.9", - "min-drop-probability": "0.05", + "period": "1m", + "min-requests": "10", + "max-drop-probability": "0.9", + "min-drop-probability": "0.05", + "max-unhealthy-endpoints-ratio": "0.3", }, expectedEnabled: true, expectedParams: &PassiveHealthCheck{ - Period: 1 * time.Minute, - MinRequests: 10, - MaxDropProbability: 0.9, - MinDropProbability: 0.05, + Period: 1 * time.Minute, + MinRequests: 10, + MaxDropProbability: 0.9, + MinDropProbability: 0.05, + MaxUnhealthyEndpointsRatio: 0.3, }, expectedError: nil, }, { inputArg: map[string]string{ - "period": "-1m", - "min-requests": "10", - "max-drop-probability": "0.9", - "min-drop-probability": "0.05", + "period": "-1m", + "min-requests": "10", + "max-drop-probability": "0.9", + "min-drop-probability": "0.05", + "max-unhealthy-endpoints-ratio": "0.3", }, expectedEnabled: false, expectedParams: nil, @@ -2334,10 +2338,11 @@ func TestInitPassiveHealthChecker(t *testing.T) { }, { inputArg: map[string]string{ - "period": "1m", - "min-requests": "somethingInvalid", - "max-drop-probability": "0.9", - "min-drop-probability": "0.05", + "period": "1m", + "min-requests": "somethingInvalid", + "max-drop-probability": "0.9", + "min-drop-probability": "0.05", + "max-unhealthy-endpoints-ratio": "0.3", }, expectedEnabled: false, expectedParams: nil, @@ -2345,10 +2350,11 @@ func TestInitPassiveHealthChecker(t *testing.T) { }, { inputArg: map[string]string{ - "period": "1m", - "min-requests": "-10", - "max-drop-probability": "0.9", - "min-drop-probability": "0.05", + "period": "1m", + "min-requests": "-10", + "max-drop-probability": "0.9", + "min-drop-probability": "0.05", + "max-unhealthy-endpoints-ratio": "0.3", }, expectedEnabled: false, expectedParams: nil, @@ -2356,10 +2362,11 @@ func TestInitPassiveHealthChecker(t *testing.T) { }, { inputArg: map[string]string{ - "period": "1m", - "min-requests": "10", - "max-drop-probability": "somethingInvalid", - "min-drop-probability": "0.05", + "period": "1m", + "min-requests": "10", + "max-drop-probability": "somethingInvalid", + "min-drop-probability": "0.05", + "max-unhealthy-endpoints-ratio": "0.3", }, expectedEnabled: false, expectedParams: nil, @@ -2367,10 +2374,11 @@ func TestInitPassiveHealthChecker(t *testing.T) { }, { inputArg: map[string]string{ - "period": "1m", - "min-requests": "10", - "max-drop-probability": "-0.1", - "min-drop-probability": "0.05", + "period": "1m", + "min-requests": "10", + "max-drop-probability": "-0.1", + "min-drop-probability": "0.05", + "max-unhealthy-endpoints-ratio": "0.3", }, expectedEnabled: false, expectedParams: nil, @@ -2378,10 +2386,11 @@ func TestInitPassiveHealthChecker(t *testing.T) { }, { inputArg: map[string]string{ - "period": "1m", - "min-requests": "10", - "max-drop-probability": "3.1415", - "min-drop-probability": "0.05", + "period": "1m", + "min-requests": "10", + "max-drop-probability": "3.1415", + "min-drop-probability": "0.05", + "max-unhealthy-endpoints-ratio": "0.3", }, expectedEnabled: false, expectedParams: nil, @@ -2389,10 +2398,11 @@ func TestInitPassiveHealthChecker(t *testing.T) { }, { inputArg: map[string]string{ - "period": "1m", - "min-requests": "10", - "max-drop-probability": "0.9", - "min-drop-probability": "somethingInvalid", + "period": "1m", + "min-requests": "10", + "max-drop-probability": "0.9", + "min-drop-probability": "somethingInvalid", + "max-unhealthy-endpoints-ratio": "0.3", }, expectedEnabled: false, expectedParams: nil, @@ -2400,10 +2410,11 @@ func TestInitPassiveHealthChecker(t *testing.T) { }, { inputArg: map[string]string{ - "period": "1m", - "min-requests": "10", - "max-drop-probability": "0.9", - "min-drop-probability": "-0.1", + "period": "1m", + "min-requests": "10", + "max-drop-probability": "0.9", + "min-drop-probability": "-0.1", + "max-unhealthy-endpoints-ratio": "0.3", }, expectedEnabled: false, expectedParams: nil, @@ -2411,10 +2422,11 @@ func TestInitPassiveHealthChecker(t *testing.T) { }, { inputArg: map[string]string{ - "period": "1m", - "min-requests": "10", - "max-drop-probability": "0.9", - "min-drop-probability": "3.1415", + "period": "1m", + "min-requests": "10", + "max-drop-probability": "0.9", + "min-drop-probability": "3.1415", + "max-unhealthy-endpoints-ratio": "0.3", }, expectedEnabled: false, expectedParams: nil, @@ -2422,10 +2434,11 @@ func TestInitPassiveHealthChecker(t *testing.T) { }, { inputArg: map[string]string{ - "period": "1m", - "min-requests": "10", - "max-drop-probability": "0.05", - "min-drop-probability": "0.9", + "period": "1m", + "min-requests": "10", + "max-drop-probability": "0.05", + "min-drop-probability": "0.9", + "max-unhealthy-endpoints-ratio": "0.3", }, expectedEnabled: false, expectedParams: nil, @@ -2433,11 +2446,48 @@ func TestInitPassiveHealthChecker(t *testing.T) { }, { inputArg: map[string]string{ - "period": "1m", - "min-requests": "10", - "max-drop-probability": "0.9", - "min-drop-probability": "0.05", - "non-existing": "non-existing", + "period": "1m", + "min-requests": "10", + "max-drop-probability": "0.9", + "min-drop-probability": "0.05", + "max-unhealthy-endpoints-ratio": "-0.1", + }, + expectedEnabled: false, + expectedParams: nil, + expectedError: fmt.Errorf(`passive health check: invalid maxUnhealthyEndpointsRatio value: "-0.1"`), + }, + { + inputArg: map[string]string{ + "period": "1m", + "min-requests": "10", + "max-drop-probability": "0.9", + "min-drop-probability": "0.05", + "max-unhealthy-endpoints-ratio": "3.1415", + }, + expectedEnabled: false, + expectedParams: nil, + expectedError: fmt.Errorf(`passive health check: invalid maxUnhealthyEndpointsRatio value: "3.1415"`), + }, + { + inputArg: map[string]string{ + "period": "1m", + "min-requests": "10", + "max-drop-probability": "0.9", + "min-drop-probability": "0.05", + "max-unhealthy-endpoints-ratio": "somethingInvalid", + }, + expectedEnabled: false, + expectedParams: nil, + expectedError: fmt.Errorf(`passive health check: invalid maxUnhealthyEndpointsRatio value: "somethingInvalid"`), + }, + { + inputArg: map[string]string{ + "period": "1m", + "min-requests": "10", + "max-drop-probability": "0.9", + "min-drop-probability": "0.05", + "max-unhealthy-endpoints-ratio": "0.3", + "non-existing": "non-existing", }, expectedEnabled: false, expectedParams: nil, @@ -2448,7 +2498,8 @@ func TestInitPassiveHealthChecker(t *testing.T) { "period": "1m", "min-requests": "10", /* forgot max-drop-probability */ - "min-drop-probability": "0.05", + "min-drop-probability": "0.05", + "max-unhealthy-endpoints-ratio": "0.3", }, expectedEnabled: false, expectedParams: nil, @@ -2459,14 +2510,34 @@ func TestInitPassiveHealthChecker(t *testing.T) { "period": "1m", "min-requests": "10", "max-drop-probability": "0.9", - /* using default max-drop-probability */ + /* using default min-drop-probability */ + "max-unhealthy-endpoints-ratio": "0.3", + }, + expectedEnabled: true, + expectedParams: &PassiveHealthCheck{ + Period: 1 * time.Minute, + MinRequests: 10, + MaxDropProbability: 0.9, + MinDropProbability: 0.0, + MaxUnhealthyEndpointsRatio: 0.3, + }, + expectedError: nil, + }, + { + inputArg: map[string]string{ + "period": "1m", + "min-requests": "10", + "max-drop-probability": "0.9", + "min-drop-probability": "0.05", + /* using default max-unhealthy-endpoints-ratio */ }, expectedEnabled: true, expectedParams: &PassiveHealthCheck{ - Period: 1 * time.Minute, - MinRequests: 10, - MaxDropProbability: 0.9, - MinDropProbability: 0.0, + Period: 1 * time.Minute, + MinRequests: 10, + MaxDropProbability: 0.9, + MinDropProbability: 0.05, + MaxUnhealthyEndpointsRatio: 1.0, }, expectedError: nil, }, diff --git a/routing/endpointregistry.go b/routing/endpointregistry.go index 4609d7bc96..2512e43a76 100644 --- a/routing/endpointregistry.go +++ b/routing/endpointregistry.go @@ -100,6 +100,7 @@ type EndpointRegistry struct { minRequests int64 minHealthCheckDropProbability float64 maxHealthCheckDropProbability float64 + maxUnhealthyEndpointsRatio float64 quit chan struct{} @@ -116,6 +117,7 @@ type RegistryOptions struct { MinRequests int64 MinHealthCheckDropProbability float64 MaxHealthCheckDropProbability float64 + MaxUnhealthyEndpointsRatio float64 } func (r *EndpointRegistry) Do(routes []*Route) []*Route { @@ -205,6 +207,7 @@ func NewEndpointRegistry(o RegistryOptions) *EndpointRegistry { minRequests: o.MinRequests, minHealthCheckDropProbability: o.MinHealthCheckDropProbability, maxHealthCheckDropProbability: o.MaxHealthCheckDropProbability, + maxUnhealthyEndpointsRatio: o.MaxUnhealthyEndpointsRatio, quit: make(chan struct{}), @@ -227,6 +230,10 @@ func (r *EndpointRegistry) GetMetrics(hostPort string) Metrics { return e.(*entry) } +func (r *EndpointRegistry) MaxUnhealthyEndpointsRatio() float64 { + return r.maxUnhealthyEndpointsRatio +} + func (r *EndpointRegistry) allMetrics() map[string]Metrics { result := make(map[string]Metrics) r.data.Range(func(k, v any) bool { diff --git a/skipper.go b/skipper.go index 4bcf0336f1..b1e7eaf3a0 100644 --- a/skipper.go +++ b/skipper.go @@ -1947,6 +1947,7 @@ func run(o Options, sig chan os.Signal, idleConnsCH chan struct{}) error { MinRequests: passiveHealthCheck.MinRequests, MinHealthCheckDropProbability: passiveHealthCheck.MinDropProbability, MaxHealthCheckDropProbability: passiveHealthCheck.MaxDropProbability, + MaxUnhealthyEndpointsRatio: passiveHealthCheck.MaxUnhealthyEndpointsRatio, }) ro := routing.Options{ FilterRegistry: o.filterRegistry(),