Skip to content

Commit

Permalink
Added max-unhealthy-endpoints-ratio cmdline parameter for PHC
Browse files Browse the repository at this point in the history
Signed-off-by: Roman Zavodskikh <roman.zavodskikh@zalando.de>
  • Loading branch information
Roman Zavodskikh committed May 17, 2024
1 parent 2600415 commit 9aaf368
Show file tree
Hide file tree
Showing 7 changed files with 214 additions and 69 deletions.
12 changes: 9 additions & 3 deletions docs/operation/operation.md
Original file line number Diff line number Diff line change
Expand Up @@ -904,15 +904,20 @@ the Skipper takes a look at previous period and if the amount of requests in the
and failed requests ratio is more than `min-drop-probability` for the given endpoints
then Skipper will send reduced (the more `max-drop-probability` and failed requests ratio
in previous period are, the stronger reduction is) amount of requests compared to amount sent without PHC.
If the ratio of unhealthy endpoints is more than `max-unhealthy-endpoints-ratio` then PHC becomes fail-open. This effectively means
if there are too many unhealthy endpoints PHC does not try to mitigate them any more and requests are sent like there is no PHC at all.

Having this, we expect less requests to fail because a lot of them would be sent to endpoints that seem to be healthy instead.

To enable this feature, you need to provide `-passive-health-check` option having forementioned parameters
(`period`, `min-requests`, `min-drop-probability`, `max-drop-probability`) defined.
(`period`, `min-requests`, `min-drop-probability`, `max-drop-probability`, `max-unhealthy-endpoints-ratio`) defined.
`period`, `min-requests`, `max-drop-probability` are required parameters, it is not possible for PHC to be enabled without
them explicitly defined by user. `min-drop-probability` is implicitly defined as `0.0` if not explicitly set by user.
`max-unhealthy-endpoints-ratio` is defined as `1.0` if not explicitly set by user.
Valid examples of `-passive-health-check` are:

+ `-passive-health-check=period=1s,min-requests=10,min-drop-probability=0.05,max-drop-probability=0.9,max-unhealthy-endpoints-ratio=0.3`
+ `-passive-health-check=period=1s,min-requests=10,max-drop-probability=0.9,max-unhealthy-endpoints-ratio=0.3`
+ `-passive-health-check=period=1s,min-requests=10,min-drop-probability=0.05,max-drop-probability=0.9`
+ `-passive-health-check=period=1s,min-requests=10,max-drop-probability=0.9`

Expand All @@ -923,9 +928,10 @@ The parameters of `-passive-health-check` option are:

+ `period=<duration>` - the duration of stats reset period
+ `min-requests=<int>` - the minimum number of requests per `period` per backend endpoint required to activate PHC for this endpoint
+ `min-drop-probabilty=<0.0 <= p <= 1.0>` - the minimum possible probability of unhealthy endpoint being not considered while choosing the endpoint for the given request. The same value is in fact used as minimal failed requests ratio for PHC to be enabled for this endpoint
+ `max-drop-probabilty=<0.0 <= p <= 1.0>` - the maximum possible probability of unhealthy endpoint being not considered
+ `min-drop-probabilty=[0.0 <= p < max-drop-probability)` - the minimum possible probability of unhealthy endpoint being not considered while choosing the endpoint for the given request. The same value is in fact used as minimal failed requests ratio for PHC to be enabled for this endpoint
+ `max-drop-probabilty=(min-drop-probability < p <= 1.0]` - the maximum possible probability of unhealthy endpoint being not considered
while choosing the endpoint for the given request
+ `max-unhealthy-endpoints-ratio=[0.0 <= r <= 1.0]` - the maximum ratio of unhealthy endpoints for PHC to try to mitigate ongoing requests

### Metrics

Expand Down
6 changes: 6 additions & 0 deletions proxy/healthy_endpoints.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,22 @@ func (h *healthyEndpoints) filterHealthyEndpoints(ctx *context, endpoints []rout

p := h.rnd.Float64()

unhealthyEndpointsCount := 0
filtered := make([]routing.LBEndpoint, 0, len(endpoints))
for _, e := range endpoints {
dropProbability := e.Metrics.HealthCheckDropProbability()
if p < dropProbability {
ctx.Logger().Infof("Dropping endpoint %q due to passive health check: p=%0.2f, dropProbability=%0.2f",
e.Host, p, dropProbability)
metrics.IncCounter("passive-health-check.endpoints.dropped")
unhealthyEndpointsCount++
} else {
filtered = append(filtered, e)
}

if float64(unhealthyEndpointsCount) > float64(len(endpoints))*e.Metrics.MaxUnhealthyEndpointsRatio() {
return endpoints
}
}

if len(filtered) == 0 {
Expand Down
31 changes: 31 additions & 0 deletions proxy/healthy_endpoints_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ func defaultEndpointRegistry() *routing.EndpointRegistry {
MinRequests: 2,
MaxHealthCheckDropProbability: 0.95,
MinHealthCheckDropProbability: 0.01,
MaxUnhealthyEndpointsRatio: 1.0,
})
}

Expand Down Expand Up @@ -210,6 +211,7 @@ func TestPHC(t *testing.T) {
MinRequests: 1, // with 2 test case fails on github actions
MaxHealthCheckDropProbability: 0.95,
MinHealthCheckDropProbability: 0.01,
MaxUnhealthyEndpointsRatio: 1.0,
})
mockMetrics, ps := setupProxyWithCustomEndpointRegisty(t, fmt.Sprintf(`* -> backendTimeout("5ms") -> consistentHashKey("${request.header.ConsistentHashKey}") -> <consistentHash, %s>`,
servicesString), consistantHashCustomEndpointRegistry)
Expand All @@ -228,6 +230,7 @@ func TestPHC(t *testing.T) {
MinRequests: 1, // with 2 test case fails on github actions
MaxHealthCheckDropProbability: 0.95,
MinHealthCheckDropProbability: 0.01,
MaxUnhealthyEndpointsRatio: 1.0,
})
mockMetrics, ps := setupProxyWithCustomEndpointRegisty(t, fmt.Sprintf(`* -> backendTimeout("5ms") -> consistentHashKey("${request.header.ConsistentHashKey}") -> consistentHashBalanceFactor(1.25) -> <consistentHash, %s>`,
servicesString), consistantHashCustomEndpointRegistry)
Expand All @@ -241,3 +244,31 @@ func TestPHC(t *testing.T) {
})
}
}

func TestPHCNoHealthyEndpoints(t *testing.T) {
const (
healthy = 0
unhealthy = 4
)

servicesString := setupServices(t, healthy, unhealthy)
mockMetrics, ps := setupProxy(t, fmt.Sprintf(`* -> backendTimeout("20ms") -> <random, %s>`,
servicesString))
va := fireVegeta(t, ps, 5000, 1*time.Second, 5*time.Second)

count200, ok := va.CountStatus(200)
assert.False(t, ok)

count504, ok := va.CountStatus(504)
assert.True(t, ok)

failedRequests := math.Abs(float64(va.TotalRequests())) - float64(count200)
t.Logf("total requests: %d, count200: %d, count504: %d, failedRequests: %f", va.TotalRequests(), count200, count504, failedRequests)

assert.InDelta(t, float64(count504), failedRequests, 5)
assert.InDelta(t, float64(va.TotalRequests()), float64(failedRequests), 0.1*float64(va.TotalRequests()))
mockMetrics.WithCounters(func(counters map[string]int64) {
assert.InDelta(t, float64(unhealthy)*float64(va.TotalRequests()), float64(counters["passive-health-check.endpoints.dropped"]), 0.3*float64(unhealthy)*float64(va.TotalRequests()))
assert.InDelta(t, 0.0, float64(counters["passive-health-check.requests.passed"]), 0.3*float64(nRequests)) // allow 30% error
})
}
18 changes: 17 additions & 1 deletion proxy/proxy.go
Original file line number Diff line number Diff line change
Expand Up @@ -162,14 +162,21 @@ type PassiveHealthCheck struct {

// The maximum probability of unhealthy endpoint to be dropped out from load balancing for every specific request
MaxDropProbability float64

// MaxUnhealthyEndpointsRatio is the maximum ratio of unhealthy endpoints in the list of all endpoints PHC will check
// in case of all endpoints are unhealthy
MaxUnhealthyEndpointsRatio float64
}

func InitPassiveHealthChecker(o map[string]string) (bool, *PassiveHealthCheck, error) {
if len(o) == 0 {
return false, &PassiveHealthCheck{}, nil
}

result := &PassiveHealthCheck{}
result := &PassiveHealthCheck{
MinDropProbability: 0.0,
MaxUnhealthyEndpointsRatio: 1.0,
}
requiredParams := map[string]struct{}{
"period": {},
"max-drop-probability": {},
Expand Down Expand Up @@ -218,6 +225,15 @@ func InitPassiveHealthChecker(o map[string]string) (bool, *PassiveHealthCheck, e
return false, nil, fmt.Errorf("passive health check: invalid minDropProbability value: %s", value)
}
result.MinDropProbability = minDropProbability
case "max-unhealthy-endpoints-ratio":
maxUnhealthyEndpointsRatio, err := strconv.ParseFloat(value, 64)
if err != nil {
return false, nil, fmt.Errorf("passive health check: invalid maxUnhealthyEndpointsRatio value: %q", value)
}
if maxUnhealthyEndpointsRatio < 0 || maxUnhealthyEndpointsRatio > 1 {
return false, nil, fmt.Errorf("passive health check: invalid maxUnhealthyEndpointsRatio value: %q", value)
}
result.MaxUnhealthyEndpointsRatio = maxUnhealthyEndpointsRatio
default:
return false, nil, fmt.Errorf("passive health check: invalid parameter: key=%s,value=%s", key, value)
}
Expand Down
Loading

0 comments on commit 9aaf368

Please sign in to comment.