From 6cd3a793bdde2b3afca04631303043030810f759 Mon Sep 17 00:00:00 2001 From: tzssangglass Date: Thu, 31 Dec 2020 08:09:35 +0800 Subject: [PATCH 01/15] kip the CI error that not relevant to this PR --- lib/resty/etcd/health_check.lua | 21 ++++++++++++ t/v3/health_check.t | 61 +++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) create mode 100644 lib/resty/etcd/health_check.lua create mode 100644 t/v3/health_check.t diff --git a/lib/resty/etcd/health_check.lua b/lib/resty/etcd/health_check.lua new file mode 100644 index 00000000..0b442e08 --- /dev/null +++ b/lib/resty/etcd/health_check.lua @@ -0,0 +1,21 @@ +local ngx_shared = ngx.shared + +local _M = {} + +local mt = { __index = _M } + +function _M.new(opts) + local shared_dict = ngx_shared[opts.shm_name] + if not shared_dict then + return nil, "failed to get ngx.shared dict: " .. opts.shm_name + end + opts.fail_timeout = opts.fail_timeout or 10 -- 10 sec + opts.max_fails = opts.max_fails or 1 + return setmetatable({ + shm_name = opts.shm_name, + fail_timeout = opts.fail_timeout, + max_fails = opts.max_fails, + }, + mt) +end +return _M \ No newline at end of file diff --git a/t/v3/health_check.t b/t/v3/health_check.t new file mode 100644 index 00000000..fc2c0e46 --- /dev/null +++ b/t/v3/health_check.t @@ -0,0 +1,61 @@ +use Test::Nginx::Socket::Lua; + +log_level('info'); +no_long_string(); +repeat_each(1); +workers(2); + +my $etcd_version = `etcd --version`; +if ($etcd_version =~ /^etcd Version: 2/ || $etcd_version =~ /^etcd Version: 3.1./ || $etcd_version =~ /^etcd Version: 3.2./) { + plan(skip_all => "etcd is too old, skip v3 protocol"); +} else { + my $enable_tls = $ENV{ETCD_ENABLE_TLS}; + if ($enable_tls eq "TRUE") { + plan(skip_all => "skip test cases with auth when TLS is enabled"); + } else { + plan 'no_plan'; + } +} + +our $HttpConfig = <<'_EOC_'; + lua_socket_log_errors off; + lua_package_path 'lib/?.lua;/usr/local/share/lua/5.3/?.lua;/usr/share/lua/5.1/?.lua;/usr/local/lua-resty-etcd/deps/share/lua/5.1/?.lua;/usr/local/lua-resty-etcd/deps/share/lua/5.1/?/?.lua;;'; + lua_shared_dict etcd_cluster_health_check 8m; +_EOC_ + +run_tests(); + +__DATA__ + +=== TEST 1: sanity +--- http_config eval: $::HttpConfig +--- config + location /t { + content_by_lua_block { + local health_check, err = require "resty.etcd.health_check" .new({ + shm_name = "etcd_cluster_health_check", + fail_timeout = 10, + max_fails = 1, + }) + + ngx.log(ngx.WARN, "health_check: ", require("resty.inspect")(health_check)) + local etcd, err = require "resty.etcd" .new({ + protocol = "v3", + http_host = { + "http://127.0.0.1:12379", + "http://127.0.0.1:22379", + "http://127.0.0.1:32379", + }, + user = 'root', + password = 'abc123', + }) + + ngx.say("all down") + } + } +--- request +GET /t +--- no_error_log +[error] +--- response_body +all down From 694eae84a5466c35e0d1394cc6ec82ee251ba52f Mon Sep 17 00:00:00 2001 From: tzssangglass Date: Mon, 4 Jan 2021 08:45:38 +0800 Subject: [PATCH 02/15] save --- lib/resty/etcd/health_check.lua | 69 ++++++++++++++++++++----- lib/resty/etcd/utils.lua | 8 +++ lib/resty/etcd/v3.lua | 90 +++++++++++++++++++++------------ t/v3/health_check.t | 17 ++++--- 4 files changed, 132 insertions(+), 52 deletions(-) diff --git a/lib/resty/etcd/health_check.lua b/lib/resty/etcd/health_check.lua index 0b442e08..2a65b48b 100644 --- a/lib/resty/etcd/health_check.lua +++ b/lib/resty/etcd/health_check.lua @@ -1,21 +1,64 @@ -local ngx_shared = ngx.shared +local ngx_shared = ngx.shared +--local utils = require("resty.etcd.utils") +local checker local _M = {} -local mt = { __index = _M } +local function is_healthy(etcd_host) + ngx.log(ngx.WARN, "etcd_host: ", require("resty.inspect")(etcd_host)) +end +_M.is_healthy = is_healthy + + +local function fault_count(key, shm_name, fail_timeout) + local new_value, err, forcible = ngx_shared[shm_name]:incr(key, 1, 0, fail_timeout) + if err then + return nil, err + end + + if forcible then + utils.log_warn("shared dict: ", shm_name, " is full, valid items forcibly overwritten") + end + return new_value, nil +end + + +local function report_fault(etcd_host) + ngx.log(ngx.WARN, "report_fault: ", require("resty.inspect")("report_fault")) + + if checker == nil then + return + end + + ngx.log(ngx.WARN, "etcd_host: ", require("resty.inspect")(etcd_host)) + local fails, err = fault_count(etcd_host, checker.shm_name, checker.fail_timeout) + if err then + utils.log_error("failed to incr etcd endpoint fail times: ", err) + return + end + ngx.log(ngx.WARN, "fails: ", require("resty.inspect")(fails)) + + if fails >= checker.max_fails then + ngx.log(ngx.WARN, "fails: ", require("resty.inspect")(fails)) + end + + +end +_M.report_fault = report_fault + function _M.new(opts) - local shared_dict = ngx_shared[opts.shm_name] - if not shared_dict then - return nil, "failed to get ngx.shared dict: " .. opts.shm_name + if checker == nil then + checker = {} + local shared_dict = ngx_shared[opts.shm_name] + if not shared_dict then + return nil, "failed to get ngx.shared dict: " .. opts.shm_name + end + checker.shm_name = opts.shm_name + checker.fail_timeout = opts.fail_timeout or 10 -- 10 sec + checker.max_fails = opts.max_fails or 1 + _M.checker = checker + return _M, nil end - opts.fail_timeout = opts.fail_timeout or 10 -- 10 sec - opts.max_fails = opts.max_fails or 1 - return setmetatable({ - shm_name = opts.shm_name, - fail_timeout = opts.fail_timeout, - max_fails = opts.max_fails, - }, - mt) end return _M \ No newline at end of file diff --git a/lib/resty/etcd/utils.lua b/lib/resty/etcd/utils.lua index c9766e3f..3272d998 100644 --- a/lib/resty/etcd/utils.lua +++ b/lib/resty/etcd/utils.lua @@ -84,6 +84,7 @@ end local ngx_log = ngx.log local ngx_ERR = ngx.ERR local ngx_INFO = ngx.INFO +local ngx_WARN = ngx.WARN local function log_error(...) return ngx_log(ngx_ERR, ...) end @@ -95,6 +96,13 @@ local function log_info( ... ) end _M.log_info = log_info + +local function log_warn( ... ) + return ngx_log(ngx_WARN, ...) +end +_M.log_warn = log_warn + + local function verify_key(key) if not key or #key == 0 then return false, "key should not be empty" diff --git a/lib/resty/etcd/v3.lua b/lib/resty/etcd/v3.lua index 5e03fa5c..08904177 100644 --- a/lib/resty/etcd/v3.lua +++ b/lib/resty/etcd/v3.lua @@ -22,6 +22,7 @@ local encode_base64 = ngx.encode_base64 local decode_base64 = ngx.decode_base64 local semaphore = require("ngx.semaphore") local INIT_COUNT_RESIZE = 2e8 +local checker = require("resty.etcd.health_check") local _M = {} @@ -30,7 +31,7 @@ local mt = { __index = _M } -- define local refresh function variable local refresh_jwt_token -local function _request_uri(self, method, uri, opts, timeout, ignore_auth) +local function _request_uri(self, endpoint, method, uri, opts, timeout, ignore_auth) utils.log_info("v3 request uri: ", uri, ", timeout: ", timeout) local body @@ -76,10 +77,12 @@ local function _request_uri(self, method, uri, opts, timeout, ignore_auth) }) if err then + checker.report_fault(endpoint.http_host) return nil, err end if res.status >= 500 then + checker.report_fault(endpoint.http_host) return nil, "invalid response code: " .. res.status end @@ -198,12 +201,18 @@ local function choose_endpoint(self) return endpoints[1] end + --for _, endpoint in ipairs(endpoints) do + -- if checker.is_healthy(endpoint.http_host) then + -- return endpoint + -- end + --end + self.init_count = (self.init_count or 0) + 1 local pos = self.init_count % endpoints_len + 1 if self.init_count >= INIT_COUNT_RESIZE then self.init_count = 0 end - + ngx.log(ngx.WARN, "endpoints[pos]: ", require("resty.inspect")(endpoints[pos])) return endpoints[pos] end @@ -249,8 +258,9 @@ function refresh_jwt_token(self, timeout) password = self.password, } } - local res, err = _request_uri(self, 'POST', - choose_endpoint(self).full_prefix .. "/auth/authenticate", + local endpoint = choose_endpoint(self) + local res, err = _request_uri(self, endpoint, 'POST', + endpoint.full_prefix .. "/auth/authenticate", opts, timeout, true) self.requesting_token = false @@ -323,9 +333,10 @@ local function set(self, key, val, attr) } } + local endpoint = choose_endpoint(self) local res - res, err = _request_uri(self, 'POST', - choose_endpoint(self).full_prefix .. "/kv/put", + res, err = _request_uri(self, endpoint, 'POST', + endpoint.full_prefix .. "/kv/put", opts, self.timeout) if err then return nil, err @@ -430,9 +441,10 @@ local function get(self, key, attr) } } + local endpoint = choose_endpoint(self) local res - res, err = _request_uri(self, "POST", - choose_endpoint(self).full_prefix .. "/kv/range", + res, err = _request_uri(self, endpoint, "POST", + endpoint.full_prefix .. "/kv/range", opts, attr and attr.timeout or self.timeout) if res and res.status == 200 then @@ -471,8 +483,9 @@ local function delete(self, key, attr) }, } - return _request_uri(self, "POST", - choose_endpoint(self).full_prefix .. "/kv/deleterange", + local endpoint = choose_endpoint(self) + return _request_uri(self, endpoint, "POST", + endpoint.full_prefix .. "/kv/deleterange", opts, self.timeout) end @@ -494,13 +507,14 @@ local function txn(self, opts_arg, compare, success, failure) }, } - return _request_uri(self, "POST", - choose_endpoint(self).full_prefix .. "/kv/txn", + local endpoint = choose_endpoint(self) + return _request_uri(self, endpoint, "POST", + endpoint.full_prefix .. "/kv/txn", opts, timeout or self.timeout) end -local function request_chunk(self, method, scheme, host, port, path, opts, timeout) +local function request_chunk(self, endpoint, method, scheme, host, port, path, opts, timeout) local body, err, _ if opts and opts.body and tab_nkeys(opts.body) > 0 then body, err = encode_json(opts.body) @@ -540,6 +554,7 @@ local function request_chunk(self, method, scheme, host, port, path, opts, timeo ok, err = http_cli:connect(host, port) if not ok then + checker.report_fault(endpoint.http_host) return nil, err end @@ -591,6 +606,8 @@ local function request_chunk(self, method, scheme, host, port, path, opts, timeo body, err = decode_json(body) if not body then return nil, "failed to decode json body: " .. (err or " unkwon") + elseif body.error and body.error.http_code >= 500 then + checker.report_fault(endpoint.http_host) end if body.result.events then @@ -700,7 +717,7 @@ local function watch(self, key, attr) local endpoint = choose_endpoint(self) - local callback_fun, err, http_cli = request_chunk(self, 'POST', + local callback_fun, err, http_cli = request_chunk(self, endpoint, 'POST', endpoint.scheme, endpoint.host, endpoint.port, @@ -931,8 +948,9 @@ function _M.grant(self, ttl, id) }, } - return _request_uri(self, "POST", - choose_endpoint(self).full_prefix .. "/lease/grant", opts) + local endpoint = choose_endpoint(self) + return _request_uri(self, endpoint, "POST", + endpoint.full_prefix .. "/lease/grant", opts) end function _M.revoke(self, id) @@ -946,8 +964,9 @@ function _M.revoke(self, id) }, } - return _request_uri(self, "POST", - choose_endpoint(self).full_prefix .. "/kv/lease/revoke", opts) + local endpoint = choose_endpoint(self) + return _request_uri(self, endpoint, "POST", + endpoint.full_prefix .. "/kv/lease/revoke", opts) end function _M.keepalive(self, id) @@ -961,8 +980,9 @@ function _M.keepalive(self, id) }, } - return _request_uri(self, "POST", - choose_endpoint(self).full_prefix .. "/lease/keepalive", opts) + local endpoint = choose_endpoint(self) + return _request_uri(self, endpoint, "POST", + endpoint.full_prefix .. "/lease/keepalive", opts) end function _M.timetolive(self, id, keys) @@ -978,8 +998,9 @@ function _M.timetolive(self, id, keys) }, } - local res, err = _request_uri(self, "POST", - choose_endpoint(self).full_prefix .. "/kv/lease/timetolive", opts) + local endpoint = choose_endpoint(self) + local res, err = _request_uri(self, endpoint, "POST", + endpoint.full_prefix .. "/kv/lease/timetolive", opts) if res and res.status == 200 then if res.body.keys and tab_nkeys(res.body.keys) > 0 then @@ -993,34 +1014,39 @@ function _M.timetolive(self, id, keys) end function _M.leases(self) - return _request_uri(self, "POST", - choose_endpoint(self).full_prefix .. "/lease/leases") + local endpoint = choose_endpoint(self) + return _request_uri(self, endpoint, "POST", + endpoint.full_prefix .. "/lease/leases") end -- /version function _M.version(self) - return _request_uri(self, "GET", - choose_endpoint(self).http_host .. "/version", + local endpoint = choose_endpoint(self) + return _request_uri(self, endpoint, "GET", + endpoint.http_host .. "/version", nil, self.timeout) end -- /stats function _M.stats_leader(self) - return _request_uri(self, "GET", - choose_endpoint(self).http_host .. "/v2/stats/leader", + local endpoint = choose_endpoint(self) + return _request_uri(self, endpoint, "GET", + endpoint.http_host .. "/v2/stats/leader", nil, self.timeout) end function _M.stats_self(self) - return _request_uri(self, "GET", - choose_endpoint(self).http_host .. "/v2/stats/self", + local endpoint = choose_endpoint(self) + return _request_uri(self, endpoint, "GET", + endpoint.http_host .. "/v2/stats/self", nil, self.timeout) end function _M.stats_store(self) - return _request_uri(self, "GET", - choose_endpoint(self).http_host .. "/v2/stats/store", + local endpoint = choose_endpoint(self) + return _request_uri(self, endpoint, "GET", + endpoint.http_host .. "/v2/stats/store", nil, self.timeout) end diff --git a/t/v3/health_check.t b/t/v3/health_check.t index fc2c0e46..6044081a 100644 --- a/t/v3/health_check.t +++ b/t/v3/health_check.t @@ -39,22 +39,25 @@ __DATA__ }) ngx.log(ngx.WARN, "health_check: ", require("resty.inspect")(health_check)) - local etcd, err = require "resty.etcd" .new({ + local etcd, err = require "resty.etcd" .new({ protocol = "v3", http_host = { - "http://127.0.0.1:12379", + "http://127.0.0.1:42379", "http://127.0.0.1:22379", "http://127.0.0.1:32379", - }, - user = 'root', - password = 'abc123', + } }) - - ngx.say("all down") + local res, err = etcd:set("/health_check", { a='abc'}) + ngx.sleep(0.2) + res, err = etcd:get("/health_check") + res, err = etcd:get("/health_check") + ngx.log(ngx.WARN, "res.body.kvs[1].value: ", require("resty.inspect")(res.body.kvs[1].value)) + ngx.say("all down") } } --- request GET /t +--- timeout: 10 --- no_error_log [error] --- response_body From 8537596990ae74a836668284f0f4efac0787ef79 Mon Sep 17 00:00:00 2001 From: tzssangglass Date: Tue, 5 Jan 2021 00:41:59 +0800 Subject: [PATCH 03/15] save --- lib/resty/etcd/health_check.lua | 59 ++++++++++++++++++++++----------- lib/resty/etcd/v3.lua | 24 ++++++++------ t/v3/health_check.t | 35 +++++++++++++++---- 3 files changed, 82 insertions(+), 36 deletions(-) diff --git a/lib/resty/etcd/health_check.lua b/lib/resty/etcd/health_check.lua index 2a65b48b..98cc6f55 100644 --- a/lib/resty/etcd/health_check.lua +++ b/lib/resty/etcd/health_check.lua @@ -1,11 +1,31 @@ local ngx_shared = ngx.shared ---local utils = require("resty.etcd.utils") -local checker +local utils = require("resty.etcd.utils") +local conf local _M = {} +local function gen_unhealthy_key(etcd_host) + return "unhealthy-" .. etcd_host +end + local function is_healthy(etcd_host) - ngx.log(ngx.WARN, "etcd_host: ", require("resty.inspect")(etcd_host)) + if conf == nil then + return + end + + local unhealthy_key = gen_unhealthy_key(etcd_host) + local unhealthy_endpoint, err = ngx_shared[conf.shm_name]:get(unhealthy_key) + if err then + utils.log_warn("failed to get unhealthy_key: ", + unhealthy_key, " err: ", err) + return + end + + if not unhealthy_endpoint then + return true + end + + return false end _M.is_healthy = is_healthy @@ -24,41 +44,42 @@ end local function report_fault(etcd_host) - ngx.log(ngx.WARN, "report_fault: ", require("resty.inspect")("report_fault")) - - if checker == nil then + if conf == nil then return end - ngx.log(ngx.WARN, "etcd_host: ", require("resty.inspect")(etcd_host)) - local fails, err = fault_count(etcd_host, checker.shm_name, checker.fail_timeout) + local fails, err = fault_count(etcd_host, conf.shm_name, conf.fail_timeout) if err then utils.log_error("failed to incr etcd endpoint fail times: ", err) return end - ngx.log(ngx.WARN, "fails: ", require("resty.inspect")(fails)) - if fails >= checker.max_fails then - ngx.log(ngx.WARN, "fails: ", require("resty.inspect")(fails)) + if fails >= conf.max_fails then + local unhealthy_key = gen_unhealthy_key(etcd_host) + local unhealthy_endpoint, _ = ngx_shared[conf.shm_name]:get(unhealthy_key) + if unhealthy_endpoint == nil then + ngx_shared[conf.shm_name]:set(unhealthy_key, etcd_host, + conf.fail_timeout) + utils.log_warn("update endpoint : ", etcd_host, " to unhealthy") + end end - - end _M.report_fault = report_fault function _M.new(opts) - if checker == nil then - checker = {} + if conf == nil then + conf = {} local shared_dict = ngx_shared[opts.shm_name] if not shared_dict then return nil, "failed to get ngx.shared dict: " .. opts.shm_name end - checker.shm_name = opts.shm_name - checker.fail_timeout = opts.fail_timeout or 10 -- 10 sec - checker.max_fails = opts.max_fails or 1 - _M.checker = checker + conf.shm_name = opts.shm_name + conf.fail_timeout = opts.fail_timeout or 10 -- 10 sec + conf.max_fails = opts.max_fails or 1 + _M.conf = conf return _M, nil end end + return _M \ No newline at end of file diff --git a/lib/resty/etcd/v3.lua b/lib/resty/etcd/v3.lua index 08904177..d5cca50b 100644 --- a/lib/resty/etcd/v3.lua +++ b/lib/resty/etcd/v3.lua @@ -22,7 +22,7 @@ local encode_base64 = ngx.encode_base64 local decode_base64 = ngx.decode_base64 local semaphore = require("ngx.semaphore") local INIT_COUNT_RESIZE = 2e8 -local checker = require("resty.etcd.health_check") +local health_check = require("resty.etcd.health_check") local _M = {} @@ -77,12 +77,12 @@ local function _request_uri(self, endpoint, method, uri, opts, timeout, ignore_a }) if err then - checker.report_fault(endpoint.http_host) + health_check.report_fault(endpoint.http_host) return nil, err end if res.status >= 500 then - checker.report_fault(endpoint.http_host) + health_check.report_fault(endpoint.http_host) return nil, "invalid response code: " .. res.status end @@ -201,18 +201,20 @@ local function choose_endpoint(self) return endpoints[1] end - --for _, endpoint in ipairs(endpoints) do - -- if checker.is_healthy(endpoint.http_host) then - -- return endpoint - -- end - --end + if health_check.conf ~= nil then + for _, endpoint in ipairs(endpoints) do + if health_check.is_healthy(endpoint.http_host) then + return endpoint + end + end + end self.init_count = (self.init_count or 0) + 1 local pos = self.init_count % endpoints_len + 1 if self.init_count >= INIT_COUNT_RESIZE then self.init_count = 0 end - ngx.log(ngx.WARN, "endpoints[pos]: ", require("resty.inspect")(endpoints[pos])) + return endpoints[pos] end @@ -554,7 +556,7 @@ local function request_chunk(self, endpoint, method, scheme, host, port, path, o ok, err = http_cli:connect(host, port) if not ok then - checker.report_fault(endpoint.http_host) + health_check.report_fault(endpoint.http_host) return nil, err end @@ -607,7 +609,7 @@ local function request_chunk(self, endpoint, method, scheme, host, port, path, o if not body then return nil, "failed to decode json body: " .. (err or " unkwon") elseif body.error and body.error.http_code >= 500 then - checker.report_fault(endpoint.http_host) + health_check.report_fault(endpoint.http_host) end if body.result.events then diff --git a/t/v3/health_check.t b/t/v3/health_check.t index 6044081a..a735d54e 100644 --- a/t/v3/health_check.t +++ b/t/v3/health_check.t @@ -34,11 +34,10 @@ __DATA__ content_by_lua_block { local health_check, err = require "resty.etcd.health_check" .new({ shm_name = "etcd_cluster_health_check", - fail_timeout = 10, - max_fails = 1, + fail_timeout = 5, + max_fails = 3, }) - ngx.log(ngx.WARN, "health_check: ", require("resty.inspect")(health_check)) local etcd, err = require "resty.etcd" .new({ protocol = "v3", http_host = { @@ -48,16 +47,40 @@ __DATA__ } }) local res, err = etcd:set("/health_check", { a='abc'}) - ngx.sleep(0.2) res, err = etcd:get("/health_check") res, err = etcd:get("/health_check") - ngx.log(ngx.WARN, "res.body.kvs[1].value: ", require("resty.inspect")(res.body.kvs[1].value)) + res, err = etcd:get("/health_check") + res, err = etcd:get("/health_check") + res, err = etcd:get("/health_check") + res, err = etcd:get("/health_check") + res, err = etcd:get("/health_check") + res, err = etcd:get("/health_check") + res, err = etcd:get("/health_check") + res, err = etcd:get("/health_check") + res, err = etcd:get("/health_check") + res, err = etcd:get("/health_check") + res, err = etcd:get("/health_check") + res, err = etcd:get("/health_check") + res, err = etcd:get("/health_check") + res, err = etcd:get("/health_check") + res, err = etcd:get("/health_check") + res, err = etcd:get("/health_check") + res, err = etcd:get("/health_check") + res, err = etcd:get("/health_check") + res, err = etcd:get("/health_check") + res, err = etcd:get("/health_check") + res, err = etcd:get("/health_check") + res, err = etcd:get("/health_check") + res, err = etcd:get("/health_check") + res, err = etcd:get("/health_check") + res, err = etcd:get("/health_check") + res, err = etcd:get("/health_check") ngx.say("all down") } } --- request GET /t ---- timeout: 10 +--- timeout: 30 --- no_error_log [error] --- response_body From d9f69ba4d541afae8370d2c053d84a6a1e9d3fbc Mon Sep 17 00:00:00 2001 From: tzssangglass Date: Wed, 6 Jan 2021 01:03:13 +0800 Subject: [PATCH 04/15] add test cases --- lib/resty/etcd/health_check.lua | 2 +- lib/resty/etcd/v3.lua | 16 +++- t/v3/health_check.t | 155 ++++++++++++++++++++++++-------- 3 files changed, 132 insertions(+), 41 deletions(-) diff --git a/lib/resty/etcd/health_check.lua b/lib/resty/etcd/health_check.lua index 98cc6f55..9f2e7d30 100644 --- a/lib/resty/etcd/health_check.lua +++ b/lib/resty/etcd/health_check.lua @@ -60,7 +60,7 @@ local function report_fault(etcd_host) if unhealthy_endpoint == nil then ngx_shared[conf.shm_name]:set(unhealthy_key, etcd_host, conf.fail_timeout) - utils.log_warn("update endpoint : ", etcd_host, " to unhealthy") + utils.log_warn("update endpoint: ", etcd_host, " to unhealthy") end end end diff --git a/lib/resty/etcd/v3.lua b/lib/resty/etcd/v3.lua index d5cca50b..383d546b 100644 --- a/lib/resty/etcd/v3.lua +++ b/lib/resty/etcd/v3.lua @@ -77,12 +77,16 @@ local function _request_uri(self, endpoint, method, uri, opts, timeout, ignore_a }) if err then - health_check.report_fault(endpoint.http_host) + if health_check.conf ~= nil then + health_check.report_fault(endpoint.http_host) + end return nil, err end if res.status >= 500 then - health_check.report_fault(endpoint.http_host) + if health_check.conf ~= nil then + health_check.report_fault(endpoint.http_host) + end return nil, "invalid response code: " .. res.status end @@ -556,7 +560,9 @@ local function request_chunk(self, endpoint, method, scheme, host, port, path, o ok, err = http_cli:connect(host, port) if not ok then - health_check.report_fault(endpoint.http_host) + if health_check.conf ~= nil then + health_check.report_fault(endpoint.http_host) + end return nil, err end @@ -609,7 +615,9 @@ local function request_chunk(self, endpoint, method, scheme, host, port, path, o if not body then return nil, "failed to decode json body: " .. (err or " unkwon") elseif body.error and body.error.http_code >= 500 then - health_check.report_fault(endpoint.http_host) + if health_check.conf ~= nil then + health_check.report_fault(endpoint.http_host) + end end if body.result.events then diff --git a/t/v3/health_check.t b/t/v3/health_check.t index a735d54e..3af5f917 100644 --- a/t/v3/health_check.t +++ b/t/v3/health_check.t @@ -19,8 +19,36 @@ if ($etcd_version =~ /^etcd Version: 2/ || $etcd_version =~ /^etcd Version: 3.1. our $HttpConfig = <<'_EOC_'; lua_socket_log_errors off; - lua_package_path 'lib/?.lua;/usr/local/share/lua/5.3/?.lua;/usr/share/lua/5.1/?.lua;/usr/local/lua-resty-etcd/deps/share/lua/5.1/?.lua;/usr/local/lua-resty-etcd/deps/share/lua/5.1/?/?.lua;;'; + lua_package_path 'lib/?.lua;/usr/local/share/lua/5.3/?.lua;/usr/share/lua/5.1/?.lua;;'; lua_shared_dict etcd_cluster_health_check 8m; + init_by_lua_block { + local cjson = require("cjson.safe") + + function check_res(data, err, val, status) + if err then + ngx.say("err: ", err) + ngx.exit(200) + end + + if val then + if data.body.kvs==nil then + ngx.exit(404) + end + if data.body.kvs and val ~= data.body.kvs[1].value then + ngx.say("failed to check value") + ngx.log(ngx.ERR, "failed to check value, got: ", data.body.kvs[1].value, + ", expect: ", val) + ngx.exit(200) + else + ngx.say("checked val as expect: ", val) + end + end + + if status and status ~= data.status then + ngx.exit(data.status) + end + end + } _EOC_ run_tests(); @@ -37,51 +65,106 @@ __DATA__ fail_timeout = 5, max_fails = 3, }) + assert( err == nil) + assert( health_check.conf ~= nil) - local etcd, err = require "resty.etcd" .new({ + local etcd, err = require "resty.etcd" .new({ protocol = "v3", http_host = { - "http://127.0.0.1:42379", + "http://127.0.0.1:12379", "http://127.0.0.1:22379", "http://127.0.0.1:32379", - } + }, + user = 'root', + password = 'abc123', + }) + check_res(etcd, err) + + ngx.say("done") + } + } +--- request +GET /t +--- no_error_log +[error] +--- response_body +done + + + +=== TEST 2: default configuration +--- http_config eval: $::HttpConfig +--- config + location /t { + content_by_lua_block { + local health_check, err = require "resty.etcd.health_check" .new({ + shm_name = "etcd_cluster_health_check", + }) + ngx.say(health_check.conf.max_fails) + ngx.say(health_check.conf.fail_timeout) + } + } +--- request +GET /t +--- response_body +1 +10 +--- no_error_log +[error] + + + +=== TEST 3: bad shm_name +--- http_config eval: $::HttpConfig +--- config + location /t { + content_by_lua_block { + local health_check, err = require "resty.etcd.health_check" .new({ + shm_name = "error_shm_name", }) - local res, err = etcd:set("/health_check", { a='abc'}) - res, err = etcd:get("/health_check") - res, err = etcd:get("/health_check") - res, err = etcd:get("/health_check") - res, err = etcd:get("/health_check") - res, err = etcd:get("/health_check") - res, err = etcd:get("/health_check") - res, err = etcd:get("/health_check") - res, err = etcd:get("/health_check") - res, err = etcd:get("/health_check") - res, err = etcd:get("/health_check") - res, err = etcd:get("/health_check") - res, err = etcd:get("/health_check") - res, err = etcd:get("/health_check") - res, err = etcd:get("/health_check") - res, err = etcd:get("/health_check") - res, err = etcd:get("/health_check") - res, err = etcd:get("/health_check") - res, err = etcd:get("/health_check") - res, err = etcd:get("/health_check") - res, err = etcd:get("/health_check") - res, err = etcd:get("/health_check") - res, err = etcd:get("/health_check") - res, err = etcd:get("/health_check") - res, err = etcd:get("/health_check") - res, err = etcd:get("/health_check") - res, err = etcd:get("/health_check") - res, err = etcd:get("/health_check") - res, err = etcd:get("/health_check") - ngx.say("all down") + ngx.say(err) } } --- request GET /t ---- timeout: 30 +--- response_body +failed to get ngx.shared dict: error_shm_name --- no_error_log [error] + + + +=== TEST 4: trigger unhealthy +--- http_config eval: $::HttpConfig +--- config + location /t { + content_by_lua_block { + local health_check, err = require "resty.etcd.health_check" .new({ + shm_name = "etcd_cluster_health_check", + fail_timeout = 10, + max_fails = 1, + }) + assert( err == nil) + assert( health_check.conf ~= nil) + + local etcd, err = require "resty.etcd" .new({ + protocol = "v3", + http_host = { + "http://127.0.0.1:42379", + "http://127.0.0.1:22379", + "http://127.0.0.1:32379", + }, + user = 'root', + password = 'abc123', + }) + + local res, err = etcd:set("/trigger_unhealthy", { a='abc'}) + ngx.say("done") + } + } +--- request +GET /t +--- error_log eval +qr/update endpoint: http:\/\/127.0.0.1:42379 to unhealthy/ --- response_body -all down +done From 069d22faec25783f72f044aa4486df52fd5c9373 Mon Sep 17 00:00:00 2001 From: tzssangglass Date: Thu, 7 Jan 2021 01:23:53 +0800 Subject: [PATCH 05/15] add test cases --- lib/resty/etcd/v3.lua | 1 + t/v3/health_check.t | 216 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 215 insertions(+), 2 deletions(-) diff --git a/lib/resty/etcd/v3.lua b/lib/resty/etcd/v3.lua index 383d546b..77d94ce4 100644 --- a/lib/resty/etcd/v3.lua +++ b/lib/resty/etcd/v3.lua @@ -211,6 +211,7 @@ local function choose_endpoint(self) return endpoint end end + utils.log_warn("has no healthy endpoint") end self.init_count = (self.init_count or 0) + 1 diff --git a/t/v3/health_check.t b/t/v3/health_check.t index 3af5f917..453367b2 100644 --- a/t/v3/health_check.t +++ b/t/v3/health_check.t @@ -144,8 +144,6 @@ failed to get ngx.shared dict: error_shm_name fail_timeout = 10, max_fails = 1, }) - assert( err == nil) - assert( health_check.conf ~= nil) local etcd, err = require "resty.etcd" .new({ protocol = "v3", @@ -168,3 +166,217 @@ GET /t qr/update endpoint: http:\/\/127.0.0.1:42379 to unhealthy/ --- response_body done + + + +=== TEST 5: fault count +--- http_config eval: $::HttpConfig +--- config + location /t { + content_by_lua_block { + local health_check, err = require "resty.etcd.health_check" .new({ + shm_name = "etcd_cluster_health_check", + fail_timeout = 5, + max_fails = 3, + }) + + local etcd, err = require "resty.etcd" .new({ + protocol = "v3", + http_host = { + "http://127.0.0.1:42379", + "http://127.0.0.1:22379", + "http://127.0.0.1:32379", + }, + user = 'root', + password = 'abc123', + }) + + etcd:set("/fault_count", { a='abc'}) + etcd:set("/fault_count", { a='abc'}) + local fails, err = ngx.shared["etcd_cluster_health_check"]:get("http://127.0.0.1:42379") + if err then + ngx.say(err) + end + ngx.say(fails) + } + } +--- request +GET /t +--- response_body +2 +--- no_error_log +[error] + + + +=== TEST 6: check endpoint is healthy +--- http_config eval: $::HttpConfig +--- config + location /t { + content_by_lua_block { + local health_check, err = require "resty.etcd.health_check" .new({ + shm_name = "etcd_cluster_health_check", + fail_timeout = 3, + max_fails = 1, + }) + + local etcd, err = require "resty.etcd" .new({ + protocol = "v3", + http_host = { + "http://127.0.0.1:42379", + "http://127.0.0.1:22379", + "http://127.0.0.1:32379", + }, + user = 'root', + password = 'abc123', + }) + + etcd:set("/is_healthy", { a='abc'}) + + local healthy = health_check.is_healthy("http://127.0.0.1:42379") + ngx.say(healthy) + } + } +--- request +GET /t +--- response_body +false +--- no_error_log +[error] + + + +=== TEST 7: make sure `fail_timeout` works +--- http_config eval: $::HttpConfig +--- config + location /t { + content_by_lua_block { + local health_check, err = require "resty.etcd.health_check" .new({ + shm_name = "etcd_cluster_health_check", + fail_timeout = 2, + max_fails = 1, + }) + + local etcd, err = require "resty.etcd" .new({ + protocol = "v3", + http_host = { + "http://127.0.0.1:42379", + "http://127.0.0.1:22379", + "http://127.0.0.1:32379", + }, + user = 'root', + password = 'abc123', + }) + + local res, err = etcd:set("/fail_timeout", "http://127.0.0.1:42379") -- trigger http://127.0.0.1:42379 to unhealthy + + res, err = etcd:set("/fail_timeout", "http://127.0.0.1:22379") -- choose http://127.0.0.1:22379 to set value + res, err = etcd:get("/fail_timeout") + assert(res.body.kvs[1].value == "http://127.0.0.1:22379") + + ngx.sleep(2) + + res, err = etcd:set("/fail_timeout", "http://127.0.0.1:42379") -- choose http://127.0.0.1:42379 to set value + res, err = etcd:get("/fail_timeout") + assert(res.body.kvs[1].value == "http://127.0.0.1:22379") + + ngx.say("done") + } + } +--- request +GET /t +--- timeout: 5 +--- response_body +done +--- no_error_log +[error] + + + +=== TEST 8: has no healthy etcd endpoint, follow old style +--- http_config eval: $::HttpConfig +--- config + location /t { + content_by_lua_block { + local health_check, err = require "resty.etcd.health_check" .new({ + shm_name = "etcd_cluster_health_check", + fail_timeout = 3, + max_fails = 1, + }) + + local etcd, err = require "resty.etcd" .new({ + protocol = "v3", + http_host = { + "http://127.0.0.1:12379", + "http://127.0.0.1:22379", + "http://127.0.0.1:32379", + }, + user = 'root', + password = 'abc123', + }) + + health_check.report_fault("http://127.0.0.1:12379") + health_check.report_fault("http://127.0.0.1:22379") + health_check.report_fault("http://127.0.0.1:32379") + + local res, err = etcd:set("/no_healthy_endpoint", "hello") + check_res(etcd, err) + + ngx.say("done") + } + } +--- request +GET /t +--- response_body +done +--- error_log eval +qr/has no healthy endpoint/ + + + +=== TEST 9: `health_check` shared by different etcd clients +--- http_config eval: $::HttpConfig +--- config + location /t { + content_by_lua_block { + local health_check, err = require "resty.etcd.health_check" .new({ + shm_name = "etcd_cluster_health_check", + fail_timeout = 3, + max_fails = 2, + }) + + local etcd1, err = require "resty.etcd" .new({ + protocol = "v3", + http_host = { + "http://127.0.0.1:42379", + "http://127.0.0.1:22379", + "http://127.0.0.1:32379", + }, + user = 'root', + password = 'abc123', + }) + + local etcd2, err = require "resty.etcd" .new({ + protocol = "v3", + http_host = { + "http://127.0.0.1:42379", + "http://127.0.0.1:22379", + "http://127.0.0.1:32379", + }, + user = 'root', + password = 'abc123', + }) + + assert(tostring(etcd1) ~= tostring(etcd2)) + etcd1:set("/etcd1", "hello") + etcd2:set("/etcd2", "hello") + + ngx.say("done") + } + } +--- request +GET /t +--- response_body +done +--- error_log eval +qr/update endpoint: http:\/\/127.0.0.1:42379 to unhealthy/ From d0135a7a56f61820b531279546bc51f032aa8cad Mon Sep 17 00:00:00 2001 From: tzssangglass Date: Fri, 8 Jan 2021 01:23:55 +0800 Subject: [PATCH 06/15] add makedown --- health_check.md | 66 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 health_check.md diff --git a/health_check.md b/health_check.md new file mode 100644 index 00000000..eaecf3d4 --- /dev/null +++ b/health_check.md @@ -0,0 +1,66 @@ +Etcd Cluster Health Check +======== + +Synopsis +======== + +```nginx +http { + # required declares a shared memory zone to store endpoints's health status + lua_shared_dict healthcheck_shm 1m; + + server { + location = /healthcheck { + content_by_lua_block { + # the health check feature is optional, and can be enabled with the following configuration. + local health_check, err = require "resty.etcd.health_check".new({ + shm_name = "healthcheck_shm", + fail_timeout = 10, + max_fails = 1, + }) + + local etcd, err = require "resty.etcd".new({ + protocol = "v3", + http_host = { + "http://127.0.0.1:12379", + "http://127.0.0.1:22379", + "http://127.0.0.1:32379", + }, + user = 'root', + password = 'abc123', + }) + } + } + } +} +``` + +Description +======== + +Implement a passive health check mechanism, when the connection/read/write fails occurs, recorded as an endpoint' failure. + +In a `fail_timeout`, if there are `max_fails` consecutive failures, the endpoint is marked as unhealthy, the unhealthy endpoint will not be choosed to connect for a `fail_timeout` time in the future. + +Health check mechanism would switch endpoint only when the previously choosed endpoint is marked as unhealthy. + +The failure counter and health status of each etcd endpoint are shared across workers and by different etcd clients. + +Config +======== + +The default configuration is as follows: + +```lua +health_check = { + shm_name = "healthcheck_shm", + fail_timeout = 10, + max_fails = 1, +} +``` + +- `shm_name`: the declarative `lua_shared_dict` is used to store the health status of endpoints. +- `fail_timeout`: sets the time during which a number of failed attempts must happen for the endpoint to be marked unavailable, and also the time for which the endpoint is marked unavailable(default is 10 seconds). +- `max_fails`: sets the number of failed attempts that must occur during the `fail_timeout` period for the endpoint to be marked unavailable (default is 1 attempt). + +Also note that the `fail_timeout` and `max_fails` of the health check cannot be changed once it has been created. From ad42834a2a33aa952bb8d07f3cbd6183ee08eaf2 Mon Sep 17 00:00:00 2001 From: tzssangglass Date: Mon, 11 Jan 2021 01:22:43 +0800 Subject: [PATCH 07/15] add test case --- t/v3/health_check.t | 57 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/t/v3/health_check.t b/t/v3/health_check.t index 453367b2..f08a5abf 100644 --- a/t/v3/health_check.t +++ b/t/v3/health_check.t @@ -380,3 +380,60 @@ GET /t done --- error_log eval qr/update endpoint: http:\/\/127.0.0.1:42379 to unhealthy/ + + + +=== TEST 10: mock etcd error and report fault +--- http_config eval: $::HttpConfig +--- config + location /v3/auth/authenticate { --- mock normal authenticate response + content_by_lua_block { + ngx.print([[{ + body = '{"header":{"cluster_id":"17237436991929493444","member_id":"9372538179322589801","revision":"40","raft_term":"633"},"token":"KicnFPYazDaiMHBG.74"}', + reason = "OK", + status = 200 + }]]) + } + } + + location /v3/kv/put { + content_by_lua_block { --- mock abnormal put key response + ngx.print([[{ + body = '{"error":"etcdserver: request timed out","message":"etcdserver: request timed out","code":14}', + body_reader = , + reason = "Service Unavailable", + status = 503, + }]]) + } + } + + location /t { + content_by_lua_block { + local health_check, err = require "resty.etcd.health_check" .new({ + shm_name = "etcd_cluster_health_check", + fail_timeout = 10, + max_fails = 1, + }) + + local etcd, err = require "resty.etcd" .new({ + protocol = "v3", + http_host = { + "http://127.0.0.1:12379", + }, + user = 'root', + password = 'abc123', + }) + + etcd.endpoints[1].full_prefix="http://localhost:1984/v3" ---replace the endpoint with mock + etcd.endpoints[1].http_host="http://localhost:1984" + local res, err = etcd:set("/etcd_error", "hello") + local fails, err = ngx.shared["etcd_cluster_health_check"]:get("http://localhost:1984") + ngx.say(fails) + } + } +--- request +GET /t +--- response_body +1 +--- error_log eval +qr/update endpoint: http:\/\/localhost:1984 to unhealthy/ From c6f918b37282a2d0c90b688646b89bff26ea2a75 Mon Sep 17 00:00:00 2001 From: tzssangglass Date: Mon, 11 Jan 2021 01:52:04 +0800 Subject: [PATCH 08/15] eak comments --- t/v3/health_check.t | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/t/v3/health_check.t b/t/v3/health_check.t index f08a5abf..39c1152f 100644 --- a/t/v3/health_check.t +++ b/t/v3/health_check.t @@ -386,8 +386,8 @@ qr/update endpoint: http:\/\/127.0.0.1:42379 to unhealthy/ === TEST 10: mock etcd error and report fault --- http_config eval: $::HttpConfig --- config - location /v3/auth/authenticate { --- mock normal authenticate response - content_by_lua_block { + location /v3/auth/authenticate { + content_by_lua_block { -- mock normal authenticate response ngx.print([[{ body = '{"header":{"cluster_id":"17237436991929493444","member_id":"9372538179322589801","revision":"40","raft_term":"633"},"token":"KicnFPYazDaiMHBG.74"}', reason = "OK", @@ -397,7 +397,7 @@ qr/update endpoint: http:\/\/127.0.0.1:42379 to unhealthy/ } location /v3/kv/put { - content_by_lua_block { --- mock abnormal put key response + content_by_lua_block { -- mock abnormal put key response ngx.print([[{ body = '{"error":"etcdserver: request timed out","message":"etcdserver: request timed out","code":14}', body_reader = , @@ -424,7 +424,7 @@ qr/update endpoint: http:\/\/127.0.0.1:42379 to unhealthy/ password = 'abc123', }) - etcd.endpoints[1].full_prefix="http://localhost:1984/v3" ---replace the endpoint with mock + etcd.endpoints[1].full_prefix="http://localhost:1984/v3" -- replace the endpoint with mock etcd.endpoints[1].http_host="http://localhost:1984" local res, err = etcd:set("/etcd_error", "hello") local fails, err = ngx.shared["etcd_cluster_health_check"]:get("http://localhost:1984") From b509472fa92d8c06869cc7b38120ecf618bc1931 Mon Sep 17 00:00:00 2001 From: tzssangglass Date: Mon, 11 Jan 2021 23:15:45 +0800 Subject: [PATCH 09/15] resolved code review --- lib/resty/etcd/health_check.lua | 4 ++-- t/v3/health_check.t | 20 ++++++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/lib/resty/etcd/health_check.lua b/lib/resty/etcd/health_check.lua index 9f2e7d30..c841fa59 100644 --- a/lib/resty/etcd/health_check.lua +++ b/lib/resty/etcd/health_check.lua @@ -67,7 +67,7 @@ end _M.report_fault = report_fault -function _M.new(opts) +function _M.init(opts) if conf == nil then conf = {} local shared_dict = ngx_shared[opts.shm_name] @@ -82,4 +82,4 @@ function _M.new(opts) end end -return _M \ No newline at end of file +return _M diff --git a/t/v3/health_check.t b/t/v3/health_check.t index 39c1152f..1c3a9108 100644 --- a/t/v3/health_check.t +++ b/t/v3/health_check.t @@ -60,7 +60,7 @@ __DATA__ --- config location /t { content_by_lua_block { - local health_check, err = require "resty.etcd.health_check" .new({ + local health_check, err = require "resty.etcd.health_check" .init({ shm_name = "etcd_cluster_health_check", fail_timeout = 5, max_fails = 3, @@ -97,7 +97,7 @@ done --- config location /t { content_by_lua_block { - local health_check, err = require "resty.etcd.health_check" .new({ + local health_check, err = require "resty.etcd.health_check" .init({ shm_name = "etcd_cluster_health_check", }) ngx.say(health_check.conf.max_fails) @@ -119,7 +119,7 @@ GET /t --- config location /t { content_by_lua_block { - local health_check, err = require "resty.etcd.health_check" .new({ + local health_check, err = require "resty.etcd.health_check" .init({ shm_name = "error_shm_name", }) ngx.say(err) @@ -139,7 +139,7 @@ failed to get ngx.shared dict: error_shm_name --- config location /t { content_by_lua_block { - local health_check, err = require "resty.etcd.health_check" .new({ + local health_check, err = require "resty.etcd.health_check" .init({ shm_name = "etcd_cluster_health_check", fail_timeout = 10, max_fails = 1, @@ -174,7 +174,7 @@ done --- config location /t { content_by_lua_block { - local health_check, err = require "resty.etcd.health_check" .new({ + local health_check, err = require "resty.etcd.health_check" .init({ shm_name = "etcd_cluster_health_check", fail_timeout = 5, max_fails = 3, @@ -214,7 +214,7 @@ GET /t --- config location /t { content_by_lua_block { - local health_check, err = require "resty.etcd.health_check" .new({ + local health_check, err = require "resty.etcd.health_check" .init({ shm_name = "etcd_cluster_health_check", fail_timeout = 3, max_fails = 1, @@ -251,7 +251,7 @@ false --- config location /t { content_by_lua_block { - local health_check, err = require "resty.etcd.health_check" .new({ + local health_check, err = require "resty.etcd.health_check" .init({ shm_name = "etcd_cluster_health_check", fail_timeout = 2, max_fails = 1, @@ -298,7 +298,7 @@ done --- config location /t { content_by_lua_block { - local health_check, err = require "resty.etcd.health_check" .new({ + local health_check, err = require "resty.etcd.health_check" .init({ shm_name = "etcd_cluster_health_check", fail_timeout = 3, max_fails = 1, @@ -339,7 +339,7 @@ qr/has no healthy endpoint/ --- config location /t { content_by_lua_block { - local health_check, err = require "resty.etcd.health_check" .new({ + local health_check, err = require "resty.etcd.health_check" .init({ shm_name = "etcd_cluster_health_check", fail_timeout = 3, max_fails = 2, @@ -409,7 +409,7 @@ qr/update endpoint: http:\/\/127.0.0.1:42379 to unhealthy/ location /t { content_by_lua_block { - local health_check, err = require "resty.etcd.health_check" .new({ + local health_check, err = require "resty.etcd.health_check" .init({ shm_name = "etcd_cluster_health_check", fail_timeout = 10, max_fails = 1, From dd9a3f7b877eb50e7c93324a91d5a07f5fe693ea Mon Sep 17 00:00:00 2001 From: tzssangglass Date: Tue, 12 Jan 2021 20:45:06 +0800 Subject: [PATCH 10/15] resolved code review --- health_check.md | 2 +- t/v3/health_check.t | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/health_check.md b/health_check.md index eaecf3d4..a6190c7d 100644 --- a/health_check.md +++ b/health_check.md @@ -13,7 +13,7 @@ http { location = /healthcheck { content_by_lua_block { # the health check feature is optional, and can be enabled with the following configuration. - local health_check, err = require "resty.etcd.health_check".new({ + local health_check, err = require "resty.etcd.health_check".init({ shm_name = "healthcheck_shm", fail_timeout = 10, max_fails = 1, diff --git a/t/v3/health_check.t b/t/v3/health_check.t index 1c3a9108..9626d097 100644 --- a/t/v3/health_check.t +++ b/t/v3/health_check.t @@ -400,7 +400,6 @@ qr/update endpoint: http:\/\/127.0.0.1:42379 to unhealthy/ content_by_lua_block { -- mock abnormal put key response ngx.print([[{ body = '{"error":"etcdserver: request timed out","message":"etcdserver: request timed out","code":14}', - body_reader = , reason = "Service Unavailable", status = 503, }]]) From e418d6140ee9858d5b3c923b8eca1bd3bd4c2ae6 Mon Sep 17 00:00:00 2001 From: tzssangglass Date: Wed, 13 Jan 2021 23:29:22 +0800 Subject: [PATCH 11/15] resolved code review --- health_check.md | 41 +++++++++---------- rockspec/lua-resty-etcd-master-0.1-0.rockspec | 1 + 2 files changed, 20 insertions(+), 22 deletions(-) diff --git a/health_check.md b/health_check.md index a6190c7d..b469d27b 100644 --- a/health_check.md +++ b/health_check.md @@ -1,8 +1,6 @@ -Etcd Cluster Health Check -======== +# Etcd Cluster Health Check -Synopsis -======== +## Synopsis ```nginx http { @@ -13,13 +11,13 @@ http { location = /healthcheck { content_by_lua_block { # the health check feature is optional, and can be enabled with the following configuration. - local health_check, err = require "resty.etcd.health_check".init({ + local health_check, err = require("resty.etcd.health_check").init({ shm_name = "healthcheck_shm", fail_timeout = 10, max_fails = 1, }) - local etcd, err = require "resty.etcd".new({ + local etcd, err = require("resty.etcd").new({ protocol = "v3", http_host = { "http://127.0.0.1:12379", @@ -35,32 +33,31 @@ http { } ``` -Description -======== - +## Description Implement a passive health check mechanism, when the connection/read/write fails occurs, recorded as an endpoint' failure. -In a `fail_timeout`, if there are `max_fails` consecutive failures, the endpoint is marked as unhealthy, the unhealthy endpoint will not be choosed to connect for a `fail_timeout` time in the future. - -Health check mechanism would switch endpoint only when the previously choosed endpoint is marked as unhealthy. - -The failure counter and health status of each etcd endpoint are shared across workers and by different etcd clients. +## Config -Config -======== +| name | Type | Requirement | Default | Description | +| ------------ | ------- | ----------- | ------- | ------------------------------------------------------------ | +| shm_name | string | required | | the declarative `lua_shared_dict` is used to store the health status of endpoints. | +| fail_timeout | integer | optional | 10 | sets the time during which a number of failed attempts must happen for the endpoint to be marked unavailable, and also the time for which the endpoint is marked unavailable. | +| max_fails | integer | optional | 1 | ets the number of failed attempts that must occur during the `fail_timeout` period for the endpoint to be marked unavailable. | -The default configuration is as follows: +lua example: ```lua -health_check = { +local health_check, err = require("resty.etcd.health_check").init({ shm_name = "healthcheck_shm", fail_timeout = 10, max_fails = 1, -} +}) ``` -- `shm_name`: the declarative `lua_shared_dict` is used to store the health status of endpoints. -- `fail_timeout`: sets the time during which a number of failed attempts must happen for the endpoint to be marked unavailable, and also the time for which the endpoint is marked unavailable(default is 10 seconds). -- `max_fails`: sets the number of failed attempts that must occur during the `fail_timeout` period for the endpoint to be marked unavailable (default is 1 attempt). +In a `fail_timeout`, if there are `max_fails` consecutive failures, the endpoint is marked as unhealthy, the unhealthy endpoint will not be choosed to connect for a `fail_timeout` time in the future. + +Health check mechanism would switch endpoint only when the previously choosed endpoint is marked as unhealthy. + +The failure counter and health status of each etcd endpoint are shared across workers and by different etcd clients. Also note that the `fail_timeout` and `max_fails` of the health check cannot be changed once it has been created. diff --git a/rockspec/lua-resty-etcd-master-0.1-0.rockspec b/rockspec/lua-resty-etcd-master-0.1-0.rockspec index 37b94df0..22b78776 100644 --- a/rockspec/lua-resty-etcd-master-0.1-0.rockspec +++ b/rockspec/lua-resty-etcd-master-0.1-0.rockspec @@ -26,5 +26,6 @@ build = { ["resty.etcd.utils"] = "lib/resty/etcd/utils.lua", ["resty.etcd.serializers.json"] = "lib/resty/etcd/serializers/json.lua", ["resty.etcd.serializers.raw"] = "lib/resty/etcd/serializers/raw.lua", + ["resty.etcd.health_check"] = "lib/resty/etcd/health_check.lua", } } From 9f62e4cbadf9e357386623123bce2388381862b5 Mon Sep 17 00:00:00 2001 From: tzssangglass Date: Wed, 13 Jan 2021 23:31:52 +0800 Subject: [PATCH 12/15] resolved code review --- health_check.md | 1 + 1 file changed, 1 insertion(+) diff --git a/health_check.md b/health_check.md index b469d27b..721b7ce1 100644 --- a/health_check.md +++ b/health_check.md @@ -34,6 +34,7 @@ http { ``` ## Description + Implement a passive health check mechanism, when the connection/read/write fails occurs, recorded as an endpoint' failure. ## Config From 0390de8443585f2ce7717be558b0db1d71149fef Mon Sep 17 00:00:00 2001 From: tzssangglass Date: Mon, 18 Jan 2021 00:23:21 +0800 Subject: [PATCH 13/15] resolved code review --- health_check.md | 24 ++++++++++++++++++++++++ lib/resty/etcd/health_check.lua | 8 ++++---- lib/resty/etcd/v3.lua | 10 +++++----- t/v3/health_check.t | 10 +++++----- 4 files changed, 38 insertions(+), 14 deletions(-) diff --git a/health_check.md b/health_check.md index 721b7ce1..19d8efb0 100644 --- a/health_check.md +++ b/health_check.md @@ -37,6 +37,30 @@ http { Implement a passive health check mechanism, when the connection/read/write fails occurs, recorded as an endpoint' failure. +## Methods + +* [init](#init) +* [report_failure](#report_failure) +* [get_target_status](#get_target_status) + +### init + +`syntax: health_check, err = health_check.init(params)` + +Initializes the health check object, overring default params with the ones given. In case of failures, returns `nil` and a string describing the error. + +### report_failure + +`syntax: health_check.report_failure(etcd_host)` + +Report a health failure. Reports a health failure which will count against the number of occurrences required to make a target "fall". + +### get_target_status + +`syntax: healthy, err = health_check.get_target_status(etcd_host)` + +Get the current status of the target. + ## Config | name | Type | Requirement | Default | Description | diff --git a/lib/resty/etcd/health_check.lua b/lib/resty/etcd/health_check.lua index c841fa59..35a5a47f 100644 --- a/lib/resty/etcd/health_check.lua +++ b/lib/resty/etcd/health_check.lua @@ -8,7 +8,7 @@ local function gen_unhealthy_key(etcd_host) return "unhealthy-" .. etcd_host end -local function is_healthy(etcd_host) +local function get_target_status(etcd_host) if conf == nil then return end @@ -27,7 +27,7 @@ local function is_healthy(etcd_host) return false end -_M.is_healthy = is_healthy +_M.get_target_status = get_target_status local function fault_count(key, shm_name, fail_timeout) @@ -43,7 +43,7 @@ local function fault_count(key, shm_name, fail_timeout) end -local function report_fault(etcd_host) +local function report_failure(etcd_host) if conf == nil then return end @@ -64,7 +64,7 @@ local function report_fault(etcd_host) end end end -_M.report_fault = report_fault +_M.report_failure = report_failure function _M.init(opts) diff --git a/lib/resty/etcd/v3.lua b/lib/resty/etcd/v3.lua index 77d94ce4..8ac3224a 100644 --- a/lib/resty/etcd/v3.lua +++ b/lib/resty/etcd/v3.lua @@ -78,14 +78,14 @@ local function _request_uri(self, endpoint, method, uri, opts, timeout, ignore_a if err then if health_check.conf ~= nil then - health_check.report_fault(endpoint.http_host) + health_check.report_failure(endpoint.http_host) end return nil, err end if res.status >= 500 then if health_check.conf ~= nil then - health_check.report_fault(endpoint.http_host) + health_check.report_failure(endpoint.http_host) end return nil, "invalid response code: " .. res.status end @@ -207,7 +207,7 @@ local function choose_endpoint(self) if health_check.conf ~= nil then for _, endpoint in ipairs(endpoints) do - if health_check.is_healthy(endpoint.http_host) then + if health_check.get_target_status(endpoint.http_host) then return endpoint end end @@ -562,7 +562,7 @@ local function request_chunk(self, endpoint, method, scheme, host, port, path, o ok, err = http_cli:connect(host, port) if not ok then if health_check.conf ~= nil then - health_check.report_fault(endpoint.http_host) + health_check.report_failure(endpoint.http_host) end return nil, err end @@ -617,7 +617,7 @@ local function request_chunk(self, endpoint, method, scheme, host, port, path, o return nil, "failed to decode json body: " .. (err or " unkwon") elseif body.error and body.error.http_code >= 500 then if health_check.conf ~= nil then - health_check.report_fault(endpoint.http_host) + health_check.report_failure(endpoint.http_host) end end diff --git a/t/v3/health_check.t b/t/v3/health_check.t index 9626d097..504849e9 100644 --- a/t/v3/health_check.t +++ b/t/v3/health_check.t @@ -231,9 +231,9 @@ GET /t password = 'abc123', }) - etcd:set("/is_healthy", { a='abc'}) + etcd:set("/get_target_status", { a='abc'}) - local healthy = health_check.is_healthy("http://127.0.0.1:42379") + local healthy = health_check.get_target_status("http://127.0.0.1:42379") ngx.say(healthy) } } @@ -315,9 +315,9 @@ done password = 'abc123', }) - health_check.report_fault("http://127.0.0.1:12379") - health_check.report_fault("http://127.0.0.1:22379") - health_check.report_fault("http://127.0.0.1:32379") + health_check.report_failure("http://127.0.0.1:12379") + health_check.report_failure("http://127.0.0.1:22379") + health_check.report_failure("http://127.0.0.1:32379") local res, err = etcd:set("/no_healthy_endpoint", "hello") check_res(etcd, err) From e75896bda86f4d368a6eb7c9808b59a61059a25d Mon Sep 17 00:00:00 2001 From: tzssangglass Date: Mon, 18 Jan 2021 16:30:13 +0800 Subject: [PATCH 14/15] resolve code review --- health_check.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/health_check.md b/health_check.md index 19d8efb0..561a3027 100644 --- a/health_check.md +++ b/health_check.md @@ -45,9 +45,9 @@ Implement a passive health check mechanism, when the connection/read/write fails ### init -`syntax: health_check, err = health_check.init(params)` +`syntax: health_check, err = health_check.init(conf)` -Initializes the health check object, overring default params with the ones given. In case of failures, returns `nil` and a string describing the error. +Initializes the health check object, overring default config with the ones given. In case of failures, returns `nil` and a string describing the error. ### report_failure From 876b5bf82a3fca060d8862304c27ee804361d339 Mon Sep 17 00:00:00 2001 From: tzssangglass Date: Fri, 22 Jan 2021 00:59:12 +0800 Subject: [PATCH 15/15] resolved code review --- health_check.md | 76 ++++++++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/health_check.md b/health_check.md index 19d8efb0..396ac3b3 100644 --- a/health_check.md +++ b/health_check.md @@ -1,41 +1,8 @@ # Etcd Cluster Health Check -## Synopsis - -```nginx -http { - # required declares a shared memory zone to store endpoints's health status - lua_shared_dict healthcheck_shm 1m; - - server { - location = /healthcheck { - content_by_lua_block { - # the health check feature is optional, and can be enabled with the following configuration. - local health_check, err = require("resty.etcd.health_check").init({ - shm_name = "healthcheck_shm", - fail_timeout = 10, - max_fails = 1, - }) - - local etcd, err = require("resty.etcd").new({ - protocol = "v3", - http_host = { - "http://127.0.0.1:12379", - "http://127.0.0.1:22379", - "http://127.0.0.1:32379", - }, - user = 'root', - password = 'abc123', - }) - } - } - } -} -``` - ## Description -Implement a passive health check mechanism, when the connection/read/write fails occurs, recorded as an endpoint' failure. +Implement a passive health check mechanism, that when the connection/read/write fails, record it as an endpoint's failure. ## Methods @@ -47,13 +14,13 @@ Implement a passive health check mechanism, when the connection/read/write fails `syntax: health_check, err = health_check.init(params)` -Initializes the health check object, overring default params with the ones given. In case of failures, returns `nil` and a string describing the error. +Initializes the health check object, overiding default params with the given ones. In case of failures, returns `nil` and a string describing the error. ### report_failure `syntax: health_check.report_failure(etcd_host)` -Report a health failure. Reports a health failure which will count against the number of occurrences required to make a target "fall". +Reports a health failure which will count against the number of occurrences required to make a target "fail". ### get_target_status @@ -66,8 +33,8 @@ Get the current status of the target. | name | Type | Requirement | Default | Description | | ------------ | ------- | ----------- | ------- | ------------------------------------------------------------ | | shm_name | string | required | | the declarative `lua_shared_dict` is used to store the health status of endpoints. | -| fail_timeout | integer | optional | 10 | sets the time during which a number of failed attempts must happen for the endpoint to be marked unavailable, and also the time for which the endpoint is marked unavailable. | -| max_fails | integer | optional | 1 | ets the number of failed attempts that must occur during the `fail_timeout` period for the endpoint to be marked unavailable. | +| fail_timeout | integer | optional | 10s | sets the time during which the specified number of unsuccessful attempts to communicate with the endpoint should happen to marker the endpoint unavailable, and also sets the period of time the endpoint will be marked unavailable. | +| max_fails | integer | optional | 1 | sets the number of failed attempts that must occur during the `fail_timeout` period for the endpoint to be marked unavailable. | lua example: @@ -86,3 +53,36 @@ Health check mechanism would switch endpoint only when the previously choosed en The failure counter and health status of each etcd endpoint are shared across workers and by different etcd clients. Also note that the `fail_timeout` and `max_fails` of the health check cannot be changed once it has been created. + +## Synopsis + +```nginx +http { + # required declares a shared memory zone to store endpoints's health status + lua_shared_dict healthcheck_shm 1m; + + server { + location = /healthcheck { + content_by_lua_block { + # the health check feature is optional, and can be enabled with the following configuration. + local health_check, err = require("resty.etcd.health_check").init({ + shm_name = "healthcheck_shm", + fail_timeout = 10, + max_fails = 1, + }) + + local etcd, err = require("resty.etcd").new({ + protocol = "v3", + http_host = { + "http://127.0.0.1:12379", + "http://127.0.0.1:22379", + "http://127.0.0.1:32379", + }, + user = 'root', + password = 'abc123', + }) + } + } + } +} +```