Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: endpoint choose by health check #109

Merged
merged 16 commits into from
Jan 25, 2021
64 changes: 64 additions & 0 deletions health_check.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Etcd Cluster Health Check

## Synopsis
tzssangglass marked this conversation as resolved.
Show resolved Hide resolved

```nginx
http {
# required: declares a shared memory zone to store the endpoints' health status
lua_shared_dict healthcheck_shm 1m;

server {
location = /healthcheck {
content_by_lua_block {
-- the health check feature is optional, and can be enabled with the following configuration.
local health_check, err = require("resty.etcd.health_check").init({
shm_name = "healthcheck_shm",
fail_timeout = 10,
max_fails = 1,
})

local etcd, err = require("resty.etcd").new({
protocol = "v3",
http_host = {
"http://127.0.0.1:12379",
"http://127.0.0.1:22379",
"http://127.0.0.1:32379",
},
user = 'root',
password = 'abc123',
})
}
}
}
}
```

## Description

Implements a passive health check mechanism: when a connection, read, or write failure occurs, it is recorded as a failure of that endpoint.
tzssangglass marked this conversation as resolved.
Show resolved Hide resolved

## Config

| name | Type | Requirement | Default | Description |
| ------------ | ------- | ----------- | ------- | ------------------------------------------------------------ |
| shm_name | string | required | | the name of the declared `lua_shared_dict` that is used to store the health status of endpoints. |
| fail_timeout | integer | optional | 10 | sets the time during which a number of failed attempts must happen for the endpoint to be marked unavailable, and also the time for which the endpoint is marked unavailable. |
tzssangglass marked this conversation as resolved.
Show resolved Hide resolved
| max_fails | integer | optional | 1 | sets the number of failed attempts that must occur during the `fail_timeout` period for the endpoint to be marked unavailable. |
tzssangglass marked this conversation as resolved.
Show resolved Hide resolved

lua example:

```lua
local health_check, err = require("resty.etcd.health_check").init({
shm_name = "healthcheck_shm",
fail_timeout = 10,
max_fails = 1,
})
```

Within a `fail_timeout` period, if there are `max_fails` consecutive failures, the endpoint is marked as unhealthy. An unhealthy endpoint will not be chosen for connections for the following `fail_timeout` period.

The health check mechanism switches endpoints only when the previously chosen endpoint is marked as unhealthy.

The failure counter and health status of each etcd endpoint are shared across workers and by different etcd clients.

Also note that the `fail_timeout` and `max_fails` of the health check cannot be changed once it has been created.
85 changes: 85 additions & 0 deletions lib/resty/etcd/health_check.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
-- Table of nginx shared-memory dictionaries, indexed by zone name.
local ngx_shared = ngx.shared
local utils = require("resty.etcd.utils")
-- Health check settings (shm_name, fail_timeout, max_fails). Stays nil until
-- init() runs; is_healthy() and report_fault() return early while it is nil.
local conf

-- Module table returned to callers of require().
local _M = {}

-- Build the shared-dict key under which an endpoint's unhealthy marker lives.
-- @param etcd_host endpoint address; appended unchanged after the prefix
-- @return the string "unhealthy-" followed by etcd_host
local function gen_unhealthy_key(etcd_host)
    local prefix = "unhealthy-"
    return prefix .. etcd_host
end

-- Tell whether an endpoint currently has no unhealthy marker in the shared dict.
-- @param etcd_host endpoint address to look up
-- @return true when no marker is stored, false when one is; returns nothing
--         when the health check has not been initialised or the lookup
--         reported an error (the error is logged as a warning)
local function is_healthy(etcd_host)
    if conf ~= nil then
        local key = gen_unhealthy_key(etcd_host)
        local dict = ngx_shared[conf.shm_name]
        local marker, err = dict:get(key)
        if not err then
            -- an absent marker means the endpoint has not been flagged
            return not marker
        end

        utils.log_warn("failed to get unhealthy_key: ",
                       key, " err: ", err)
    end
end
_M.is_healthy = is_healthy
tzssangglass marked this conversation as resolved.
Show resolved Hide resolved


-- Increment the failure counter stored under `key` in the named shared dict.
-- The counter is created at 0 with `fail_timeout` as its initial TTL when the
-- key does not exist yet, then incremented by 1.
-- @param key          counter key
-- @param shm_name     name of the lua_shared_dict zone
-- @param fail_timeout initial TTL passed to incr for a newly created counter
-- @return the new counter value and nil, or nil and an error message
local function fault_count(key, shm_name, fail_timeout)
    local dict = ngx_shared[shm_name]
    local count, err, evicted = dict:incr(key, 1, 0, fail_timeout)
    if err then
        return nil, err
    end

    if evicted then
        utils.log_warn("shared dict: ", shm_name, " is full, valid items forcibly overwritten")
    end

    return count, nil
end


-- Record one failure for an endpoint and flag it unhealthy once the failure
-- counter reaches conf.max_fails.
-- Does nothing when the health check has not been initialised.
-- @param etcd_host endpoint address that just failed
local function report_fault(etcd_host)
    if conf == nil then
        return
    end

    local fails, err = fault_count(etcd_host, conf.shm_name, conf.fail_timeout)
    if err then
        utils.log_error("failed to incr etcd endpoint fail times: ", err)
        return
    end

    if fails < conf.max_fails then
        return
    end

    local shared_dict = ngx_shared[conf.shm_name]
    local unhealthy_key = gen_unhealthy_key(etcd_host)
    local unhealthy_endpoint = shared_dict:get(unhealthy_key)
    if unhealthy_endpoint ~= nil then
        -- already flagged; leave the existing marker and its TTL untouched
        return
    end

    -- The marker expires after fail_timeout, which ends the unhealthy period.
    local ok, set_err, forcible = shared_dict:set(unhealthy_key, etcd_host,
                                                  conf.fail_timeout)
    if not ok then
        -- do not claim the endpoint was flagged when the write failed
        utils.log_error("failed to mark etcd endpoint: ", etcd_host,
                        " as unhealthy: ", set_err)
        return
    end

    if forcible then
        utils.log_warn("shared dict: ", conf.shm_name,
                       " is full, valid items forcibly overwritten")
    end

    utils.log_warn("update endpoint: ", etcd_host, " to unhealthy")
end
_M.report_fault = report_fault
tzssangglass marked this conversation as resolved.
Show resolved Hide resolved


-- Initialise the health check configuration once per Lua VM.
-- The first successful call stores the settings; later calls leave them
-- unchanged and simply return the module again.
-- @param opts table with:
--        shm_name     (required) name of a declared lua_shared_dict
--        fail_timeout (optional, default 10) seconds
--        max_fails    (optional, default 1)
-- @return the module table and nil, or nil and an error message
function _M.init(opts)
    if conf ~= nil then
        -- already configured: settings cannot be changed after creation
        return _M, nil
    end

    if type(opts) ~= "table" then
        return nil, "opts must be a table"
    end

    local shm_name = opts.shm_name
    local shared_dict = shm_name ~= nil and ngx_shared[shm_name]
    if not shared_dict then
        return nil, "failed to get ngx.shared dict: " .. tostring(shm_name)
    end

    -- Assign conf only after validation so a failed init leaves the module
    -- in its "not enabled" state instead of half-configured.
    conf = {
        shm_name = shm_name,
        fail_timeout = opts.fail_timeout or 10, -- 10 sec
        max_fails = opts.max_fails or 1,
    }
    _M.conf = conf

    return _M, nil
end

return _M
8 changes: 8 additions & 0 deletions lib/resty/etcd/utils.lua
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ end
local ngx_log = ngx.log
local ngx_ERR = ngx.ERR
local ngx_INFO = ngx.INFO
local ngx_WARN = ngx.WARN
-- Forward all arguments to ngx.log at the ERR level and return its result.
local function log_error(...)
return ngx_log(ngx_ERR, ...)
end
Expand All @@ -95,6 +96,13 @@ local function log_info( ... )
end
_M.log_info = log_info


-- Forward all arguments to ngx.log at the WARN level and return its result.
local function log_warn( ... )
return ngx_log(ngx_WARN, ...)
end
_M.log_warn = log_warn


local function verify_key(key)
if not key or #key == 0 then
return false, "key should not be empty"
Expand Down
Loading